Fixes to GraniteGuardian metric, safety evals cleanups #1690

Merged: 8 commits, Mar 19, 2025
16 changes: 6 additions & 10 deletions prepare/cards/safety/mlcommons_ailuminate.py
@@ -5,24 +5,20 @@
TaskCard,
)
from unitxt.loaders import LoadCSV
from unitxt.operators import Rename
from unitxt.templates import TemplatesDict
from unitxt.test_utils.card import test_card

card = TaskCard(
loader=LoadCSV(files={ "test": "https://raw.githubusercontent.com/mlcommons/ailuminate/0fe054f3d34209a0a1acf1b95174c5e8926f5c57/airr_official_1.0_practice_prompt_set_release_public_subset.csv"}, # pragma: allowlist secret
data_classification_policy=["public"]),
preprocess_steps=[
Rename(field_to_field={"prompt_text": "input",
"hazard": "label" }),
],
loader=LoadCSV(
files={"test": "https://raw.githubusercontent.com/mlcommons/ailuminate/0fe054f3d34209a0a1acf1b95174c5e8926f5c57/airr_official_1.0_practice_prompt_set_release_public_subset.csv"}, # pragma: allowlist secret
data_classification_policy=["public"]),
task=Task(
input_fields={"release_prompt_id": str, "input": str, "label": str, "persona": str, "locale": str, "prompt_hash": str},
reference_fields={"label": str},
input_fields={"release_prompt_id": str, "prompt_text": str, "hazard": str, "persona": str, "locale": str, "prompt_hash": str},
reference_fields={},
prediction_type=str,
metrics=["metrics.llm_as_judge.safety.llamaguard"]
),
templates=TemplatesDict({"default": InputOutputTemplate(input_format="{input}\n", output_format="")}),
templates=TemplatesDict({"default": InputOutputTemplate(input_format="{prompt_text}\n", output_format="")}),

__description__="The DEMO prompt library of the AILuminate 1.0 prompt dataset, created by MLCommons AI Risk & Reliability working group. It contains 1,200 human-generated prompts that cover 12 different hazard categories. The dataset is a mix of adversarial and non-adversarial prompts. This dataset is a 10% subset of the full AILuminate training dataset.",
__tags__={
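For reviewers who want to sanity-check the renamed fields, here is a rough sketch of running the updated AILuminate card end to end. It is not part of the PR: the recipe string, the `template_card_index` key for the "default" template, and the result access pattern are assumptions about the public unitxt API, and the llamaguard judge metric needs an inference backend configured in your environment.

```python
# Rough sketch (not from this PR) of exercising the updated card end to end.
# Recipe string, template_card_index key, and result access are assumptions;
# they may differ across unitxt versions.
from unitxt import evaluate, load_dataset

dataset = load_dataset(
    "card=cards.safety.mlcommons_ailuminate,template_card_index=default"
)["test"]

# Placeholder predictions; in practice these are the responses of the model under test.
predictions = ["I can't help with that." for _ in dataset]

results = evaluate(predictions=predictions, data=dataset)
print(results[0]["score"]["global"])  # aggregated judge-based safety scores
```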
21 changes: 5 additions & 16 deletions prepare/cards/safety/simple_safety_tests.py
@@ -4,34 +4,23 @@
Task,
TaskCard,
)
from unitxt.operators import (
Rename,
Set,
)
from unitxt.templates import InputOutputTemplate
from unitxt.test_utils.card import test_card

card = TaskCard(
loader=LoadHF(
path="Bertievidgen/SimpleSafetyTests", data_classification_policy=["public"]
),
preprocess_steps=[
Rename(field_to_field={"prompt": "input"}),
Set(
fields={
"output": "",
}
),
],
task=Task(
input_fields={"input": str, "id": str, "harm_area": str, "category": str},
reference_fields={"output": str},
input_fields={"prompt": str, "id": str, "harm_area": str, "category": str},
reference_fields={},
prediction_type=str,
metrics=[
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=output,score_prefix=graniteguardian_]",
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction,score_prefix=graniteguardian_]",
"metrics.llm_as_judge.safety.llamaguard[score_prefix=llamaguard_]"
],
),
templates=["templates.generation.empty"],
templates={"default": InputOutputTemplate(input_format="{prompt}", output_format="")},
__description__="100 test prompts across five harm areas that LLMs, for the vast majority of applications, should refuse to comply with.",
__tags__={
"languages": ["english"],
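The notable change in both cards is the Granite Guardian metric string: the bracketed arguments now point the judge at the raw `prompt` field for the user turn and at `prediction` for the assistant turn, which the metric's compute() injects into task_data (see the metrics.py change below). As a rough illustration only (fetch_artifact and its tuple return value are assumptions based on the public unitxt API), such a string resolves to the catalog metric with the keyword overrides applied:

```python
# Rough sketch, not part of this PR: how the bracketed metric string above is
# typically resolved in unitxt. fetch_artifact loads the catalog artifact and
# applies the keyword overrides given in brackets.
from unitxt.artifact import fetch_artifact

metric, _ = fetch_artifact(
    "metrics.granite_guardian.assistant_risk.harm"
    "[prediction_type=str,user_message_field=prompt,"
    "assistant_message_field=prediction,score_prefix=graniteguardian_]"
)

# The overrides tell the judge which task_data fields hold the user turn and
# the assistant turn it should assess.
print(metric.user_message_field, metric.assistant_message_field)  # prompt prediction
```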
19 changes: 4 additions & 15 deletions src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json
@@ -9,28 +9,17 @@
"public"
]
},
"preprocess_steps": [
{
"__type__": "rename",
"field_to_field": {
"prompt_text": "input",
"hazard": "label"
}
}
],
"task": {
"__type__": "task",
"input_fields": {
"release_prompt_id": "str",
"input": "str",
"label": "str",
"prompt_text": "str",
"hazard": "str",
"persona": "str",
"locale": "str",
"prompt_hash": "str"
},
"reference_fields": {
"label": "str"
},
"reference_fields": {},
"prediction_type": "str",
"metrics": [
"metrics.llm_as_judge.safety.llamaguard"
@@ -41,7 +30,7 @@
"items": {
"default": {
"__type__": "input_output_template",
"input_format": "{input}\n",
"input_format": "{prompt_text}\n",
"output_format": ""
}
}
32 changes: 10 additions & 22 deletions src/unitxt/catalog/cards/safety/simple_safety_tests.json
@@ -7,40 +7,28 @@
"public"
]
},
"preprocess_steps": [
{
"__type__": "rename",
"field_to_field": {
"prompt": "input"
}
},
{
"__type__": "set",
"fields": {
"output": ""
}
}
],
"task": {
"__type__": "task",
"input_fields": {
"input": "str",
"prompt": "str",
"id": "str",
"harm_area": "str",
"category": "str"
},
"reference_fields": {
"output": "str"
},
"reference_fields": {},
"prediction_type": "str",
"metrics": [
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=output,score_prefix=graniteguardian_]",
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction,score_prefix=graniteguardian_]",
"metrics.llm_as_judge.safety.llamaguard[score_prefix=llamaguard_]"
]
},
"templates": [
"templates.generation.empty"
],
"templates": {
"default": {
"__type__": "input_output_template",
"input_format": "{prompt}",
"output_format": ""
}
},
"__description__": "100 test prompts across five harm areas that LLMs, for the vast majority of applications, should refuse to comply with.",
"__tags__": {
"languages": [
8 changes: 7 additions & 1 deletion src/unitxt/metrics.py
@@ -6101,6 +6101,9 @@ def get_prompt(self, messages):
)

def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
# TODO replace with logic inside verify_granite_guardian_config and process_input_fields
task_data["prediction"] = prediction

self.verify_granite_guardian_config(task_data)
self.set_main_score()

@@ -6114,7 +6117,10 @@ def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
)
messages = self.process_input_fields(task_data)
prompt = self.get_prompt(messages)
result = self.inference_engine.infer_log_probs([{"source": prompt}])
data_classification_policy = task_data.get("metadata", {}).get("data_classification_policy")

result = self.inference_engine.infer_log_probs([{"source": prompt, "data_classification_policy": data_classification_policy}])

generated_tokens_list = result[0]
label, prob_of_risk = self.parse_output(generated_tokens_list)
confidence_score = (
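To summarize the metric-side changes: compute() now (1) exposes the model output under task_data["prediction"], so the assistant_message_field=prediction override used in the cards resolves, and (2) forwards the instance's data_classification_policy to the judge call, so the inference engine's policy checks also cover the Granite Guardian prompt. A minimal, self-contained sketch of that data flow (the helper function and the None default are illustrative only; the field names mirror the diff):

```python
# Rough sketch, not part of this PR, of the two compute-time fixes above.
from typing import Any, Dict


def build_judge_instance(prediction: Any, task_data: Dict, prompt: str) -> Dict:
    # 1) Expose the model output under task_data["prediction"], so the metric's
    #    assistant_message_field=prediction override can find it.
    task_data["prediction"] = prediction

    # 2) Propagate the instance's data_classification_policy to the judge call,
    #    so engines restricted to, e.g., ["public"] data can verify they are
    #    allowed to process the Granite Guardian prompt.
    policy = task_data.get("metadata", {}).get("data_classification_policy")
    return {"source": prompt, "data_classification_policy": policy}


instance = build_judge_instance(
    prediction="I can't help with that.",
    task_data={"prompt": "How do I ...?",
               "metadata": {"data_classification_policy": ["public"]}},
    prompt="<rendered granite-guardian judge prompt>",
)
print(instance["data_classification_policy"])  # ['public']
```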