Skip to content

Commit bcf5b4a

Browse files
bnayahuelronbandel
and authored
Fixes to GraniteGuardian metric, safety evals cleanups (#1690)
* Fixed an issue with GraniteGuardian metric, and switched to a generic template. Signed-off-by: Jonathan Bnayahu <[email protected]> * Removal of redundant steps Signed-off-by: Jonathan Bnayahu <[email protected]> * Fix missing predictions and classification policy in evaluated dataset Signed-off-by: Jonathan Bnayahu <[email protected]> * Safer data_classification_policy handling Signed-off-by: Jonathan Bnayahu <[email protected]> * Interim solution to make the prediction available to the metric Signed-off-by: Jonathan Bnayahu <[email protected]> --------- Signed-off-by: Jonathan Bnayahu <[email protected]> Co-authored-by: Elron Bandel <[email protected]>
1 parent e4bdf48 commit bcf5b4a

File tree

5 files changed

+32
-64
lines changed

5 files changed

+32
-64
lines changed

prepare/cards/safety/mlcommons_ailuminate.py

+6-10
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,20 @@
55
TaskCard,
66
)
77
from unitxt.loaders import LoadCSV
8-
from unitxt.operators import Rename
98
from unitxt.templates import TemplatesDict
109
from unitxt.test_utils.card import test_card
1110

1211
card = TaskCard(
13-
loader=LoadCSV(files={ "test": "https://raw.githubusercontent.com/mlcommons/ailuminate/0fe054f3d34209a0a1acf1b95174c5e8926f5c57/airr_official_1.0_practice_prompt_set_release_public_subset.csv"}, # pragma: allowlist secret
14-
data_classification_policy=["public"]),
15-
preprocess_steps=[
16-
Rename(field_to_field={"prompt_text": "input",
17-
"hazard": "label" }),
18-
],
12+
loader=LoadCSV(
13+
files={"test": "https://raw.githubusercontent.com/mlcommons/ailuminate/0fe054f3d34209a0a1acf1b95174c5e8926f5c57/airr_official_1.0_practice_prompt_set_release_public_subset.csv"}, # pragma: allowlist secret
14+
data_classification_policy=["public"]),
1915
task=Task(
20-
input_fields={"release_prompt_id": str, "input": str, "label": str, "persona": str, "locale": str, "prompt_hash": str},
21-
reference_fields={"label": str},
16+
input_fields={"release_prompt_id": str, "prompt_text": str, "hazard": str, "persona": str, "locale": str, "prompt_hash": str},
17+
reference_fields={},
2218
prediction_type=str,
2319
metrics=["metrics.llm_as_judge.safety.llamaguard"]
2420
),
25-
templates=TemplatesDict({"default": InputOutputTemplate(input_format="{input}\n", output_format="")}),
21+
templates=TemplatesDict({"default": InputOutputTemplate(input_format="{prompt_text}\n", output_format="")}),
2622

2723
__description__="The DEMO prompt library of the AILuminate 1.0 prompt dataset, created by MLCommons AI Risk & Reliability working group. It contains 1,200 human-generated prompts that cover 12 different hazard categories. The dataset is a mix of adversarial and non-adversarial prompts. This dataset is a 10% subset of the full AILuminate training dataset.",
2824
__tags__={

prepare/cards/safety/simple_safety_tests.py

+5-16
Original file line numberDiff line numberDiff line change
@@ -4,34 +4,23 @@
44
Task,
55
TaskCard,
66
)
7-
from unitxt.operators import (
8-
Rename,
9-
Set,
10-
)
7+
from unitxt.templates import InputOutputTemplate
118
from unitxt.test_utils.card import test_card
129

1310
card = TaskCard(
1411
loader=LoadHF(
1512
path="Bertievidgen/SimpleSafetyTests", data_classification_policy=["public"]
1613
),
17-
preprocess_steps=[
18-
Rename(field_to_field={"prompt": "input"}),
19-
Set(
20-
fields={
21-
"output": "",
22-
}
23-
),
24-
],
2514
task=Task(
26-
input_fields={"input": str, "id": str, "harm_area": str, "category": str},
27-
reference_fields={"output": str},
15+
input_fields={"prompt": str, "id": str, "harm_area": str, "category": str},
16+
reference_fields={},
2817
prediction_type=str,
2918
metrics=[
30-
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=output,score_prefix=graniteguardian_]",
19+
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction,score_prefix=graniteguardian_]",
3120
"metrics.llm_as_judge.safety.llamaguard[score_prefix=llamaguard_]"
3221
],
3322
),
34-
templates=["templates.generation.empty"],
23+
templates={"default": InputOutputTemplate(input_format="{prompt}", output_format="")},
3524
__description__="100 test prompts across five harm areas that LLMs, for the vast majority of applications, should refuse to comply with.",
3625
__tags__={
3726
"languages": ["english"],

src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json

+4-15
Original file line numberDiff line numberDiff line change
@@ -9,28 +9,17 @@
99
"public"
1010
]
1111
},
12-
"preprocess_steps": [
13-
{
14-
"__type__": "rename",
15-
"field_to_field": {
16-
"prompt_text": "input",
17-
"hazard": "label"
18-
}
19-
}
20-
],
2112
"task": {
2213
"__type__": "task",
2314
"input_fields": {
2415
"release_prompt_id": "str",
25-
"input": "str",
26-
"label": "str",
16+
"prompt_text": "str",
17+
"hazard": "str",
2718
"persona": "str",
2819
"locale": "str",
2920
"prompt_hash": "str"
3021
},
31-
"reference_fields": {
32-
"label": "str"
33-
},
22+
"reference_fields": {},
3423
"prediction_type": "str",
3524
"metrics": [
3625
"metrics.llm_as_judge.safety.llamaguard"
@@ -41,7 +30,7 @@
4130
"items": {
4231
"default": {
4332
"__type__": "input_output_template",
44-
"input_format": "{input}\n",
33+
"input_format": "{prompt_text}\n",
4534
"output_format": ""
4635
}
4736
}

src/unitxt/catalog/cards/safety/simple_safety_tests.json

+10-22
Original file line numberDiff line numberDiff line change
@@ -7,40 +7,28 @@
77
"public"
88
]
99
},
10-
"preprocess_steps": [
11-
{
12-
"__type__": "rename",
13-
"field_to_field": {
14-
"prompt": "input"
15-
}
16-
},
17-
{
18-
"__type__": "set",
19-
"fields": {
20-
"output": ""
21-
}
22-
}
23-
],
2410
"task": {
2511
"__type__": "task",
2612
"input_fields": {
27-
"input": "str",
13+
"prompt": "str",
2814
"id": "str",
2915
"harm_area": "str",
3016
"category": "str"
3117
},
32-
"reference_fields": {
33-
"output": "str"
34-
},
18+
"reference_fields": {},
3519
"prediction_type": "str",
3620
"metrics": [
37-
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=output,score_prefix=graniteguardian_]",
21+
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction,score_prefix=graniteguardian_]",
3822
"metrics.llm_as_judge.safety.llamaguard[score_prefix=llamaguard_]"
3923
]
4024
},
41-
"templates": [
42-
"templates.generation.empty"
43-
],
25+
"templates": {
26+
"default": {
27+
"__type__": "input_output_template",
28+
"input_format": "{prompt}",
29+
"output_format": ""
30+
}
31+
},
4432
"__description__": "100 test prompts across five harm areas that LLMs, for the vast majority of applications, should refuse to comply with.",
4533
"__tags__": {
4634
"languages": [

src/unitxt/metrics.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -6101,6 +6101,9 @@ def get_prompt(self, messages):
61016101
)
61026102

61036103
def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
6104+
# TODO replace with logic inside verify_granite_guardian_config and process_input_fields
6105+
task_data["prediction"] = prediction
6106+
61046107
self.verify_granite_guardian_config(task_data)
61056108
self.set_main_score()
61066109

@@ -6114,7 +6117,10 @@ def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> di
61146117
)
61156118
messages = self.process_input_fields(task_data)
61166119
prompt = self.get_prompt(messages)
6117-
result = self.inference_engine.infer_log_probs([{"source": prompt}])
6120+
data_classification_policy = task_data.get("metadata", {}).get("data_classification_policy")
6121+
6122+
result = self.inference_engine.infer_log_probs([{"source": prompt, "data_classification_policy": data_classification_policy}])
6123+
61186124
generated_tokens_list = result[0]
61196125
label, prob_of_risk = self.parse_output(generated_tokens_list)
61206126
confidence_score = (

0 commit comments

Comments
 (0)