Address all current pylint and mypy warnings #40103

Open
wants to merge 12 commits into main
@@ -3,7 +3,7 @@
# ---------------------------------------------------------

import math
from typing import List, Callable, Any
from typing import List, Callable, Any, Sequence

from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget

@@ -32,7 +32,7 @@ def list_mean(lst: List[float]) -> float:
return list_sum(lst) / len(lst)


def list_mean_nan_safe(lst: List[float]) -> float:
def list_mean_nan_safe(lst: Sequence[float]) -> float:
"""Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.

:param lst: A list of floats.
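The signature change above widens `list_mean_nan_safe` from `List[float]` to `Sequence[float]`, so read-only array-likes (such as the numeric columns this PR later produces with `pd.to_numeric`) type-check without a spurious mypy error. The snippet below is a minimal, hypothetical stand-in written only to illustrate the behavior the docstring describes; `nan_safe_mean` is not the SDK helper.

```python
import math
from typing import Sequence


def nan_safe_mean(values: Sequence[float]) -> float:
    """Illustrative stand-in: drop NaN/None entries, then average what remains."""
    kept = [v for v in values if v is not None and not math.isnan(v)]
    if not kept:
        # The real helper raises EvaluationException here; a plain ValueError
        # keeps this sketch self-contained.
        raise ValueError("All values are NaN/None; nothing to average.")
    return sum(kept) / len(kept)


# Accepting Sequence[float] (read-only) instead of List[float] (mutable) means a
# tuple, a list, or a cast pandas Series can all be passed without complaint.
print(nan_safe_mean([1.0, math.nan, 3.0]))  # 2.0
```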
@@ -44,6 +44,7 @@

INFERENCE_OF_SENSITIVE_ATTRIBUTES = "inference_sensitive_attributes"


def get_formatted_template(data: dict, annotation_task: str) -> str:
"""Given the task and input data, produce a formatted string that will serve as the main
payload for the RAI service. Requires specific per-task logic.
@@ -66,16 +67,13 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
}
return json.dumps(as_dict)
if annotation_task == Tasks.CODE_VULNERABILITY:
as_dict = {
"context": data.get("query", ""),
"completion": data.get("response", "")
}
as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
return json.dumps(as_dict)
if annotation_task == Tasks.UNGROUNDED_ATTRIBUTES:
as_dict = {
"query": data.get("query", ""),
"response": data.get("response", ""),
"context": data.get("context", "")
"context": data.get("context", ""),
}
return json.dumps(as_dict)
as_dict = {
@@ -267,6 +265,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
sleep_time = RAIService.SLEEP_TIME**request_count
await asyncio.sleep(sleep_time)


def parse_response( # pylint: disable=too-many-branches,too-many-statements
batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
) -> Dict[str, Union[str, float]]:
@@ -295,10 +294,13 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
result = {}
if not batch_response or len(batch_response[0]) == 0:
return {}
if metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]:
batch_response[0] = {
EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
}
if (
metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]
):
batch_response[0] = {
EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
}
if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
for pm_metric_name in pm_metric_names:
@@ -334,7 +336,7 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
result[metric_display_name + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
)
if metric_name == EvaluationMetrics.CODE_VULNERABILITY or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
if metric_name in (EvaluationMetrics.CODE_VULNERABILITY, EvaluationMetrics.UNGROUNDED_ATTRIBUTES):
# Add all attributes under the details.
details = {}
for key, value in parsed_response.items():
@@ -345,7 +347,7 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
return _parse_content_harm_response(batch_response, metric_name, metric_display_name)


def _parse_content_harm_response(
def _parse_content_harm_response( # pylint: disable=too-many-branches,too-many-statements
batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
) -> Dict[str, Union[str, float]]:
"""Parse the annotation response from Responsible AI service for a content harm evaluation.
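Two of the changes in this file are worth a second look: the per-task payload dictionaries are collapsed onto single lines (Black-style), and the chained `metric_name == ... or metric_name == ...` comparison becomes a tuple membership test, which is what pylint's `consider-using-in` check asks for. The sketch below restates those two patterns in isolation; the constant names and the `wants_details` helper are invented for illustration and are not part of the SDK.

```python
import json

# Hypothetical metric-name constants, standing in for EvaluationMetrics members.
CODE_VULNERABILITY = "code_vulnerability"
UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"


def build_code_vuln_payload(data: dict) -> str:
    """Mirror of the per-task payload shaping shown above: map query/response
    onto the context/completion fields and serialize to JSON."""
    as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
    return json.dumps(as_dict)


def wants_details(metric_name: str) -> bool:
    # Equivalent to `metric_name == CODE_VULNERABILITY or metric_name == UNGROUNDED_ATTRIBUTES`,
    # but written as a membership test, as pylint's consider-using-in suggests.
    return metric_name in (CODE_VULNERABILITY, UNGROUNDED_ATTRIBUTES)


print(build_code_vuln_payload({"query": "def f(x):", "response": "    return x + 1"}))
print(wants_details("code_vulnerability"))  # True
```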
@@ -6,7 +6,7 @@
import logging
import os
import re
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypedDict, TypeVar, Union, cast

import pandas as pd
from promptflow._sdk._constants import LINE_NUMBER
@@ -161,44 +161,48 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
metric_name = col.split(".")[1]
if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
label_cols.append(col)
if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
details_cols = col

label_df = df[label_cols]
defect_rates = {}
for col in label_df.columns:
defect_rate_name = col.replace("_label", "_defect_rate")
col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
col_with_boolean_values = cast(Sequence[float], pd.to_numeric(label_df[col], errors="coerce"))
try:
defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
            except EvaluationException:  # the only exception that can be raised here is for all-NaN values
msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
LOGGER.warning(msg)

if details_cols:
details_df = df[details_cols]
detail_defect_rates = {}
detail_defect_rates: Dict[str, float] = {}

for key, value in details_df.items():
_process_rows(value, detail_defect_rates)

for key, value in detail_defect_rates.items():
col_with_boolean_values = pd.to_numeric(value, errors="coerce")
col_with_boolean_values = cast(Sequence[float], pd.to_numeric(value, errors="coerce"))
try:
defect_rates[f"{details_cols}.{key}_defect_rate"] = round(list_mean_nan_safe(col_with_boolean_values), 2)
defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
list_mean_nan_safe(col_with_boolean_values), 2
)
                except EvaluationException:  # the only exception that can be raised here is for all-NaN values
msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
LOGGER.warning(msg)

return label_cols, defect_rates


def _process_rows(row, detail_defect_rates):
for key, value in row.items():
if key not in detail_defect_rates:
detail_defect_rates[key] = []
detail_defect_rates[key].append(value)
return detail_defect_rates


def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
"""Aggregate metrics from the evaluation results.
On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -478,7 +482,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
def _apply_target_to_data(
target: Callable,
data: Union[str, os.PathLike],
batch_client: TClient,
batch_client: ProxyClient,
initial_data: pd.DataFrame,
evaluation_name: Optional[str] = None,
**kwargs,
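The `cast(Sequence[float], pd.to_numeric(...))` calls above exist purely to satisfy mypy: `pd.to_numeric` is annotated loosely, while `list_mean_nan_safe` now expects a `Sequence[float]`. `cast` has no runtime effect, so the Series flows through unchanged. Here is a self-contained sketch of the same pattern, using a hypothetical `defect_rate` helper rather than the SDK's aggregation code.

```python
from typing import Sequence, cast

import pandas as pd


def defect_rate(label_col: pd.Series) -> float:
    """Hypothetical helper: coerce labels to numeric, ignore NaN, report the mean."""
    # cast() only changes what the type checker sees; pd.to_numeric still
    # returns a Series at runtime, which iterates like a Sequence[float].
    numeric = cast(Sequence[float], pd.to_numeric(label_col, errors="coerce"))
    kept = [v for v in numeric if pd.notna(v)]
    return round(sum(kept) / len(kept), 2) if kept else float("nan")


labels = pd.Series([True, False, True, None])
print(defect_rate(labels))  # 0.67
```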
@@ -61,7 +61,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
"bleu_score": score,
}

@overload # type: ignore
@overload
def __call__(self, *, response: str, ground_truth: str):
"""
Evaluate the BLEU score between the response and the ground truth.
@@ -73,20 +73,33 @@ def __call__(self, *, response: str, ground_truth: str):
:return: The BLEU score.
:rtype: Dict[str, float]
"""
...

@overload
def __call__(
self,
*args,
**kwargs,
):
"""
Evaluate the BLEU score between the response and the ground truth.

:param args: The arguments to pass to the evaluation function.
:type args: Any
:rtype: Dict[str, float]"""
...

@override
def __call__( # pylint: disable=docstring-missing-param
def __call__(
self,
*args,
**kwargs,
):
"""
Evaluate the BLEU score between the response and the ground truth.

:keyword response: The response to be evaluated.
:paramtype response: str
:keyword ground_truth: The ground truth to be compared against.
:paramtype ground_truth: str
:param args: The arguments to pass to the evaluation function.
:type args: Any
:return: The BLEU score.
:rtype: Dict[str, float]
"""
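The BLEU evaluator gains an explicit catch-all `*args, **kwargs` overload, which lets the first overload drop its `# type: ignore` and lets the `@override` implementation document only `*args`. Below is a stripped-down, hypothetical evaluator showing the same layout; it omits the SDK base class, so `@override` is left out and only the overload mechanics are demonstrated.

```python
from typing import Any, Dict

from typing_extensions import overload


class EchoEvaluator:
    """Hypothetical evaluator illustrating the overload layout used in this PR."""

    @overload
    def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
        """Documented keyword-only form: callers see this signature in tooling."""
        ...

    @overload
    def __call__(self, *args: Any, **kwargs: Any) -> Dict[str, float]:
        """Catch-all form, mirroring the flexible signature the base class exposes."""
        ...

    def __call__(self, *args: Any, **kwargs: Any) -> Dict[str, float]:
        # The single implementation behind both overloads; real evaluators
        # dispatch to their scoring logic here.
        response = kwargs.get("response", "")
        ground_truth = kwargs.get("ground_truth", "")
        return {"match": float(response == ground_truth)}


print(EchoEvaluator()(response="hi", ground_truth="hi"))  # {'match': 1.0}
```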
@@ -1,22 +1,24 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing_extensions import overload, override
from typing import Dict, Union
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase


# cspell:ignore ssrf, vuln
@experimental
class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
"""
Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
where query represents the user query or code before the completion, and response represents the code recommended by the assistant.
Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
where query represents the user query or code before the completion, and response represents the
code recommended by the assistant.

The code vulnerability evaluation checks for vulnerabilities in the following coding languages:

- Python
- Java
- C++
@@ -26,7 +28,7 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
- SQL

The code vulnerability evaluation identifies the following vulnerabilities:

- path-injection
- sql-injection
- code-injection
@@ -85,13 +87,13 @@ def __init__(
credential=credential,
)

@overload
@overload # type: ignore[override]
def __call__(
self,
*,
query: str,
response: str,
) -> Dict[str, Union[str, float]]:
) -> Dict[str, Union[str, float]]:
"""Evaluate a given query/response pair for code vulnerability

:keyword query: The query to be evaluated.
@@ -101,20 +103,29 @@ def __call__(
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""
...

@overload
def __call__(self, *args, **kwargs):
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.

:param Any args: The arguments to pass to the evaluator.
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""
...

@override
def __call__( # pylint: disable=docstring-missing-param
def __call__(
self,
*args,
**kwargs,
):
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.

:keyword query: The query to be evaluated.
:paramtype query: Optional[str]
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:param Any args: The arguments to pass to the evaluator.
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""

return super().__call__(*args, **kwargs)
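One small but deliberate detail above: the suppression is written as `# type: ignore[override]` rather than a bare `# type: ignore`. Scoping the ignore to a single error code keeps mypy honest about everything else on that line. A stand-alone illustration follows; the class names are invented for this sketch.

```python
from typing import List


class Base:
    def values(self) -> List[int]:
        return [1, 2, 3]


class Narrowed(Base):
    # A bare "# type: ignore" would hide *every* error on this line; the
    # bracketed form silences only the override incompatibility mypy reports
    # for the changed return type, so unrelated mistakes still surface.
    def values(self) -> List[str]:  # type: ignore[override]
        return ["a", "b", "c"]


print(Narrowed().values())
```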
@@ -2,7 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os
from typing import Dict, Union, List
from typing import Dict, List, Optional, Union

from typing_extensions import overload, override

@@ -67,7 +67,7 @@ def __call__(
:rtype: Dict[str, float]
"""

@overload
@overload # type: ignore[override]
def __call__(
self,
*,
@@ -82,9 +82,27 @@ def __call__(
:return: The coherence score.
:rtype: Dict[str, Union[float, Dict[str, List[float]]]]
"""
...

@overload
def __call__(
self,
*,
query: str,
response: Optional[str] = None,
):
"""
Evaluate coherence for a query and response.

:keyword str query: The query to be evaluated.
:keyword Optional[str] response: The response to be evaluated.
        :return: The coherence score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
"""
...

@override
def __call__( # pylint: disable=docstring-missing-param
def __call__(
self,
*args,
**kwargs,
@@ -93,14 +111,7 @@ def __call__(  # pylint: disable=docstring-missing-param
or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
turns, the evaluator will aggregate the results of each turn.

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:param Any args: The arguments to evaluate.
        :return: The coherence score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
"""