
Fixes to GraniteGuardian metric, safety evals cleanups #1690

Merged: 8 commits, Mar 19, 2025
Changes from all commits

16 changes: 6 additions & 10 deletions prepare/cards/safety/mlcommons_ailuminate.py
@@ -5,24 +5,20 @@
     TaskCard,
 )
 from unitxt.loaders import LoadCSV
-from unitxt.operators import Rename
 from unitxt.templates import TemplatesDict
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
-    loader=LoadCSV(files={ "test": "https://raw.githubusercontent.com/mlcommons/ailuminate/0fe054f3d34209a0a1acf1b95174c5e8926f5c57/airr_official_1.0_practice_prompt_set_release_public_subset.csv"}, # pragma: allowlist secret
-                   data_classification_policy=["public"]),
-    preprocess_steps=[
-        Rename(field_to_field={"prompt_text": "input",
-                               "hazard": "label" }),
-    ],
+    loader=LoadCSV(
+        files={"test": "https://raw.githubusercontent.com/mlcommons/ailuminate/0fe054f3d34209a0a1acf1b95174c5e8926f5c57/airr_official_1.0_practice_prompt_set_release_public_subset.csv"}, # pragma: allowlist secret
+        data_classification_policy=["public"]),
     task=Task(
-        input_fields={"release_prompt_id": str, "input": str, "label": str, "persona": str, "locale": str, "prompt_hash": str},
-        reference_fields={"label": str},
+        input_fields={"release_prompt_id": str, "prompt_text": str, "hazard": str, "persona": str, "locale": str, "prompt_hash": str},
+        reference_fields={},
         prediction_type=str,
         metrics=["metrics.llm_as_judge.safety.llamaguard"]
     ),
-    templates=TemplatesDict({"default": InputOutputTemplate(input_format="{input}\n", output_format="")}),
+    templates=TemplatesDict({"default": InputOutputTemplate(input_format="{prompt_text}\n", output_format="")}),
 
     __description__="The DEMO prompt library of the AILuminate 1.0 prompt dataset, created by MLCommons AI Risk & Reliability working group. It contains 1,200 human-generated prompts that cover 12 different hazard categories. The dataset is a mix of adversarial and non-adversarial prompts. This dataset is a 10% subset of the full AILuminate training dataset.",
     __tags__={
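With the Rename preprocessing dropped, the card's task and template now consume the dataset's native columns (prompt_text, hazard) directly. Below is a minimal sketch of loading the reworked card, assuming unitxt's standard load_dataset entry point; the recipe arguments (template_card_index, split) are illustrative and may vary across unitxt versions.

```python
# A sketch only: assumes the top-level unitxt API; argument names such as
# template_card_index and split may differ between unitxt releases.
from unitxt import load_dataset

# Load the reworked card; its "default" template formats the raw prompt_text
# column directly, with no Rename step in between.
dataset = load_dataset(
    card="cards.safety.mlcommons_ailuminate",
    template_card_index="default",
    split="test",
)

# Each instance's "source" should be the unmodified prompt_text plus a trailing newline.
print(dataset[0]["source"])
```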
21 changes: 5 additions & 16 deletions prepare/cards/safety/simple_safety_tests.py
@@ -4,34 +4,23 @@
     Task,
     TaskCard,
 )
-from unitxt.operators import (
-    Rename,
-    Set,
-)
+from unitxt.templates import InputOutputTemplate
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
     loader=LoadHF(
         path="Bertievidgen/SimpleSafetyTests", data_classification_policy=["public"]
     ),
-    preprocess_steps=[
-        Rename(field_to_field={"prompt": "input"}),
-        Set(
-            fields={
-                "output": "",
-            }
-        ),
-    ],
     task=Task(
-        input_fields={"input": str, "id": str, "harm_area": str, "category": str},
-        reference_fields={"output": str},
+        input_fields={"prompt": str, "id": str, "harm_area": str, "category": str},
+        reference_fields={},
         prediction_type=str,
         metrics=[
-            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=output,score_prefix=graniteguardian_]",
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction,score_prefix=graniteguardian_]",
             "metrics.llm_as_judge.safety.llamaguard[score_prefix=llamaguard_]"
         ],
     ),
-    templates=["templates.generation.empty"],
+    templates={"default": InputOutputTemplate(input_format="{prompt}", output_format="")},
     __description__="100 test prompts across five harm areas that LLMs, for the vast majority of applications, should refuse to comply with.",
     __tags__={
         "languages": ["english"],
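The Granite Guardian metric override now points user_message_field at the raw prompt column and assistant_message_field at prediction, the model output that the metric's compute() injects into task_data (see the metrics.py change below). The sketch that follows is purely illustrative of the chat turns the guardian is expected to score for one instance; the real message construction lives in process_input_fields in src/unitxt/metrics.py.

```python
# Illustrative only: placeholder values stand in for a real SimpleSafetyTests
# prompt and a real model response; field names mirror the metric override above.
task_data = {
    "prompt": "<an unsafe request from the dataset>",  # user_message_field=prompt
    "prediction": "<the evaluated model's reply>",     # assistant_message_field=prediction
}

# The guardian judges the user turn together with the assistant turn.
messages = [
    {"role": "user", "content": task_data["prompt"]},
    {"role": "assistant", "content": task_data["prediction"]},
]
print(messages)
```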
19 changes: 4 additions & 15 deletions src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json
@@ -9,28 +9,17 @@
             "public"
         ]
     },
-    "preprocess_steps": [
-        {
-            "__type__": "rename",
-            "field_to_field": {
-                "prompt_text": "input",
-                "hazard": "label"
-            }
-        }
-    ],
     "task": {
         "__type__": "task",
         "input_fields": {
             "release_prompt_id": "str",
-            "input": "str",
-            "label": "str",
+            "prompt_text": "str",
+            "hazard": "str",
             "persona": "str",
             "locale": "str",
             "prompt_hash": "str"
         },
-        "reference_fields": {
-            "label": "str"
-        },
+        "reference_fields": {},
         "prediction_type": "str",
         "metrics": [
             "metrics.llm_as_judge.safety.llamaguard"
@@ -41,7 +30,7 @@
         "items": {
             "default": {
                 "__type__": "input_output_template",
-                "input_format": "{input}\n",
+                "input_format": "{prompt_text}\n",
                 "output_format": ""
             }
         }
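The catalog JSON is the serialized counterpart of the prepare script above, so the two must stay in sync. A short sketch of fetching the updated card by its catalog name, assuming unitxt's fetch_artifact helper behaves as in current releases:

```python
# A sketch, assuming fetch_artifact resolves a catalog name to the artifact
# defined by the JSON above and returns it together with the catalog it came from.
from unitxt.artifact import fetch_artifact

card, _ = fetch_artifact("cards.safety.mlcommons_ailuminate")
# The task should expose the raw dataset columns, not the old renamed "input"/"label".
print(sorted(card.task.input_fields))
```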
32 changes: 10 additions & 22 deletions src/unitxt/catalog/cards/safety/simple_safety_tests.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,28 @@
"public"
]
},
"preprocess_steps": [
{
"__type__": "rename",
"field_to_field": {
"prompt": "input"
}
},
{
"__type__": "set",
"fields": {
"output": ""
}
}
],
"task": {
"__type__": "task",
"input_fields": {
"input": "str",
"prompt": "str",
"id": "str",
"harm_area": "str",
"category": "str"
},
"reference_fields": {
"output": "str"
},
"reference_fields": {},
"prediction_type": "str",
"metrics": [
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=output,score_prefix=graniteguardian_]",
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction,score_prefix=graniteguardian_]",
"metrics.llm_as_judge.safety.llamaguard[score_prefix=llamaguard_]"
]
},
"templates": [
"templates.generation.empty"
],
"templates": {
"default": {
"__type__": "input_output_template",
"input_format": "{prompt}",
"output_format": ""
}
},
"__description__": "100 test prompts across five harm areas that LLMs, for the vast majority of applications, should refuse to comply with.",
"__tags__": {
"languages": [
Expand Down
8 changes: 7 additions & 1 deletion src/unitxt/metrics.py
@@ -6101,6 +6101,9 @@ def get_prompt(self, messages):
         )
 
     def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
+        # TODO replace with logic inside verify_granite_guardian_config and process_input_fields
+        task_data["prediction"] = prediction
+
         self.verify_granite_guardian_config(task_data)
         self.set_main_score()
 
@@ -6114,7 +6117,10 @@ def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
         )
         messages = self.process_input_fields(task_data)
         prompt = self.get_prompt(messages)
-        result = self.inference_engine.infer_log_probs([{"source": prompt}])
+        data_classification_policy = task_data.get("metadata", {}).get("data_classification_policy")
+
+        result = self.inference_engine.infer_log_probs([{"source": prompt, "data_classification_policy": data_classification_policy}])
+
         generated_tokens_list = result[0]
         label, prob_of_risk = self.parse_output(generated_tokens_list)
         confidence_score = (
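Taken together, the compute() change does two things: it injects the model prediction into task_data so field-based options like assistant_message_field=prediction can resolve, and it forwards the instance's data_classification_policy into the infer_log_probs request so the guardian call is bound by the same data policy as the dataset. A standalone sketch of that forwarding logic, written outside the unitxt class purely for clarity:

```python
from typing import Any, Dict


def build_guardian_request(prompt: str, prediction: Any, task_data: Dict) -> Dict:
    """Mirror of the two fixes above, as a free function for illustration."""
    # 1) Make the prediction addressable as a task_data field, so metric options
    #    such as assistant_message_field=prediction can point at it.
    task_data["prediction"] = prediction

    # 2) Propagate the instance's data classification policy to the inference
    #    request, so the engine applies the same policy checks as the dataset.
    policy = task_data.get("metadata", {}).get("data_classification_policy")
    return {"source": prompt, "data_classification_policy": policy}


# Example: a public instance produces a request tagged with the "public" policy.
request = build_guardian_request(
    prompt="<granite guardian prompt>",
    prediction="<model output>",
    task_data={"metadata": {"data_classification_policy": ["public"]}},
)
print(request["data_classification_policy"])  # ["public"]
```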