IBM
diff --git a/‎examples/evaluate_bluebench.sh‎
Lines changed: 31 additions & 0 deletions b/‎examples/evaluate_bluebench.sh‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎prepare/cards/attaq_500.py‎
Lines changed: 3 additions & 2 deletions b/‎prepare/cards/attaq_500.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎prepare/recipes/bluebench.py‎
Lines changed: 10 additions & 4 deletions b/‎prepare/recipes/bluebench.py‎
Lines changed: 10 additions & 4 deletions
diff --git a/‎src/unitxt/catalog/cards/attaq_500.json‎
Lines changed: 7 additions & 6 deletions b/‎src/unitxt/catalog/cards/attaq_500.json‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_age.json‎
Lines changed: 2 additions & 2 deletions b/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_age.json‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_disability_status.json‎
Lines changed: 2 additions & 2 deletions b/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_disability_status.json‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_gender_identity.json‎
Lines changed: 2 additions & 2 deletions b/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_gender_identity.json‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_nationality.json‎
Lines changed: 2 additions & 2 deletions b/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_nationality.json‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_physical_appearance.json‎
Lines changed: 2 additions & 2 deletions b/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_physical_appearance.json‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_race_ethnicity.json‎
Lines changed: 2 additions & 2 deletions b/‎src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_race_ethnicity.json‎
Lines changed: 2 additions & 2 deletions
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+######################################################################################################
+# BlueBench is an open-source benchmark developed by domain experts to represent the needs of        #
+# enterprise users. It is constructed using state-of-the-art benchmarking methodologies to ensure    #
+# validity, robustness, and efficiency by utilizing unitxt’s abilities for dynamic and flexible text #
+# processing. As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains   #
+# such as legal, finance, customer support, and news. It also evaluates a range of capabilities,     #
+# including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks  #
+# and domains to be integrated over time.                                                            #
+#                                                                                                    #
+# Further details: https://ibm.biz/bluebench                                                         #
+######################################################################################################
+
+if [ $# -eq 0 ]
+  then
+    echo "Usage: evaluate_bluebench.sh model-to-evaluate-in-litellm-format"
+    exit 1
+fi
+
+unitxt-evaluate \
+    --tasks "benchmarks.bluebench" \
+    --model cross_provider \
+    --model_args "model_name=$1,max_tokens=1024" \
+    --output_path ./results/bluebench \
+    --log_samples \
+    --trust_remote_code \
+    --batch_size 8 \
+    --verbosity ERROR
+
+unitxt-summarize ./results/bluebench
@@ -519,8 +519,9 @@
         Shuffle(page_size=2800),
     ],
     task=Task(
-        input_fields=["input"],
-        reference_fields=["label"],
+        input_fields={"input": str},
+        reference_fields={"label": str},
+        prediction_type=str,
         metrics=["metrics.safety_metric"],
     ),
     templates=[
 
@@ -66,7 +66,7 @@
     "template_card_index": 1,
     "max_train_instances": 1000,
     "max_validation_instances": 1000,
-    "max_test_instances": 1000,
+    "max_test_instances": 100,
 }
 
 
@@ -82,6 +82,8 @@ def prepare_recipe(default_args, specific_args):
 
     if "template" in recipe and "template_card_index" in recipe:
         del recipe["template_card_index"]
+
+    # Note: BlueBench only uses the chat_api format.
     return DatasetRecipe(**recipe, format="formats.chat_api")
 
 
@@ -145,6 +147,7 @@ def prepare_recipe(default_args, specific_args):
 ingridients = {
     "card": "cards.20_newsgroups_short",
     "template": "templates.classification.multi_class.bluebench",
+    "num_demos": 1,
 }
 recipe = prepare_recipe(default_args, ingridients)
 add_to_catalog(
@@ -159,7 +162,7 @@ def prepare_recipe(default_args, specific_args):
         "card": f"cards.safety.bbq.{subset}",
         "demos_pool_size": 20,
         "num_demos": 5,
-        "template": "templates.qa.multiple_choice.with_context.match",
+        "template": "templates.qa.multiple_choice.with_context.bluebench",
         "demos_taken_from": "test",
     }
     recipe = prepare_recipe(default_args, ingridients)
@@ -178,6 +181,7 @@ def prepare_recipe(default_args, specific_args):
         "demos_pool_size": 10,
         "template": "templates.classification.multi_class.bluebench",
         "demos_taken_from": "test",
+        "num_demos": 1,
     }
     recipe = prepare_recipe(default_args, ingridients)
     add_to_catalog(
@@ -235,10 +239,11 @@ def prepare_recipe(default_args, specific_args):
         "card": f"cards.universal_ner.{subset}",
         "demos_pool_size": 10000,
         "num_demos": 5,
-        "template": "templates.span_labeling.extraction.title",
+        "template": "templates.span_labeling.extraction.detailed",
         "metrics": ["metrics.ner[zero_division=1.0]"],
         "train_refiner": "operators.balancers.ner.zero_vs_many_entities[segments_boundaries=[0,1,2]]",
         "demos_taken_from": "test" if "pud" in subset else "train",
+        "max_train_instances": 10000,
     }
     recipe = prepare_recipe(default_args, ingridients)
     add_to_catalog(
@@ -293,6 +298,7 @@ def prepare_recipe(default_args, specific_args):
 ingridients = {
     "card": "cards.rag.response_generation.clapnq",
     "template": "templates.rag.response_generation.bluebench",
+    "num_demos": 1,
 }
 recipe = prepare_recipe(default_args, ingridients)
 add_to_catalog(
@@ -306,7 +312,7 @@ def prepare_recipe(default_args, specific_args):
 
 ingridients = {
     "card": "cards.fin_qa",
-    "num_demos": 1,
+    "num_demos": 2,
     "template_card_index": 0,
 }
 recipe = prepare_recipe(default_args, ingridients)
 
@@ -526,12 +526,13 @@
     ],
     "task": {
         "__type__": "task",
-        "input_fields": [
-            "input"
-        ],
-        "reference_fields": [
-            "label"
-        ],
+        "input_fields": {
+            "input": "str"
+        },
+        "reference_fields": {
+            "label": "str"
+        },
+        "prediction_type": "str",
         "metrics": [
             "metrics.safety_metric"
         ]
 
@@ -5,8 +5,8 @@
     "demos_taken_from": "test",
     "max_train_instances": 1000,
     "max_validation_instances": 1000,
-    "max_test_instances": 90,
+    "max_test_instances": 9,
     "card": "cards.safety.bbq.Age",
-    "template": "templates.qa.multiple_choice.with_context.match",
+    "template": "templates.qa.multiple_choice.with_context.bluebench",
     "format": "formats.chat_api"
 }
@@ -5,8 +5,8 @@
     "demos_taken_from": "test",
     "max_train_instances": 1000,
     "max_validation_instances": 1000,
-    "max_test_instances": 90,
+    "max_test_instances": 9,
     "card": "cards.safety.bbq.Disability_status",
-    "template": "templates.qa.multiple_choice.with_context.match",
+    "template": "templates.qa.multiple_choice.with_context.bluebench",
     "format": "formats.chat_api"
 }
@@ -5,8 +5,8 @@
     "demos_taken_from": "test",
     "max_train_instances": 1000,
     "max_validation_instances": 1000,
-    "max_test_instances": 90,
+    "max_test_instances": 9,
     "card": "cards.safety.bbq.Gender_identity",
-    "template": "templates.qa.multiple_choice.with_context.match",
+    "template": "templates.qa.multiple_choice.with_context.bluebench",
     "format": "formats.chat_api"
 }
@@ -5,8 +5,8 @@
     "demos_taken_from": "test",
     "max_train_instances": 1000,
     "max_validation_instances": 1000,
-    "max_test_instances": 90,
+    "max_test_instances": 9,
     "card": "cards.safety.bbq.Nationality",
-    "template": "templates.qa.multiple_choice.with_context.match",
+    "template": "templates.qa.multiple_choice.with_context.bluebench",
     "format": "formats.chat_api"
 }
@@ -5,8 +5,8 @@
     "demos_taken_from": "test",
     "max_train_instances": 1000,
     "max_validation_instances": 1000,
-    "max_test_instances": 90,
+    "max_test_instances": 9,
     "card": "cards.safety.bbq.Physical_appearance",
-    "template": "templates.qa.multiple_choice.with_context.match",
+    "template": "templates.qa.multiple_choice.with_context.bluebench",
     "format": "formats.chat_api"
 }
@@ -5,8 +5,8 @@
     "demos_taken_from": "test",
     "max_train_instances": 1000,
     "max_validation_instances": 1000,
-    "max_test_instances": 90,
+    "max_test_instances": 9,
     "card": "cards.safety.bbq.Race_ethnicity",
-    "template": "templates.qa.multiple_choice.with_context.match",
+    "template": "templates.qa.multiple_choice.with_context.bluebench",
     "format": "formats.chat_api"
 }