Skip to content

Commit a4aca82

Browse files
bnayahu and elronbandel authored
Update BlueBench to match the original implementation (#1855)
* Updates to BlueBench to better match the internal version. Signed-off-by: Jonathan Bnayahu <[email protected]>
* Resolve warning on undeclared fields. Signed-off-by: Jonathan Bnayahu <[email protected]>
* Switch to the detailed entity extraction template. Signed-off-by: Jonathan Bnayahu <[email protected]>
---------
Signed-off-by: Jonathan Bnayahu <[email protected]>
Co-authored-by: Elron Bandel <[email protected]>
1 parent c032820 commit a4aca82

File tree

60 files changed

+121
-82
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+121
-82
lines changed

examples/evaluate_bluebench.sh

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,31 @@
1+
#!/bin/bash
2+
3+
######################################################################################################
4+
# BlueBench is an open-source benchmark developed by domain experts to represent required needs of #
5+
# Enterprise users. It is constructed using state-of-the-art benchmarking methodologies to ensure #
6+
# validity, robustness, and efficiency by utilizing unitxt’s abilities for dynamic and flexible text #
7+
# processing. As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains #
8+
# such as legal, finance, customer support, and news. It also evaluates a range of capabilities, #
9+
# including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks #
10+
# and domains to be integrated over time. #
11+
# #
12+
# Further details: https://ibm.biz/bluebench #
13+
######################################################################################################
14+
15+
if [ $# -eq 0 ]
16+
then
17+
echo "Usage: evaluate_bluebench.sh model-to-evaluate-in-litellm-format"
18+
exit 1
19+
fi
20+
21+
unitxt-evaluate \
22+
--tasks "benchmarks.bluebench" \
23+
--model cross_provider \
24+
--model_args "model_name=$1,max_tokens=1024" \
25+
--output_path ./results/bluebench \
26+
--log_samples \
27+
--trust_remote_code \
28+
--batch_size 8 \
29+
--verbosity ERROR
30+
31+
unitxt-summarize ./results/bluebench

prepare/cards/attaq_500.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -519,8 +519,9 @@
519519
Shuffle(page_size=2800),
520520
],
521521
task=Task(
522-
input_fields=["input"],
523-
reference_fields=["label"],
522+
input_fields={"input": str},
523+
reference_fields={"label": str},
524+
prediction_type=str,
524525
metrics=["metrics.safety_metric"],
525526
),
526527
templates=[

prepare/recipes/bluebench.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@
6666
"template_card_index": 1,
6767
"max_train_instances": 1000,
6868
"max_validation_instances": 1000,
69-
"max_test_instances": 1000,
69+
"max_test_instances": 100,
7070
}
7171

7272

@@ -82,6 +82,8 @@ def prepare_recipe(default_args, specific_args):
8282

8383
if "template" in recipe and "template_card_index" in recipe:
8484
del recipe["template_card_index"]
85+
86+
# Note: BlueBench only uses the chat_api format.
8587
return DatasetRecipe(**recipe, format="formats.chat_api")
8688

8789

@@ -145,6 +147,7 @@ def prepare_recipe(default_args, specific_args):
145147
ingridients = {
146148
"card": "cards.20_newsgroups_short",
147149
"template": "templates.classification.multi_class.bluebench",
150+
"num_demos": 1,
148151
}
149152
recipe = prepare_recipe(default_args, ingridients)
150153
add_to_catalog(
@@ -159,7 +162,7 @@ def prepare_recipe(default_args, specific_args):
159162
"card": f"cards.safety.bbq.{subset}",
160163
"demos_pool_size": 20,
161164
"num_demos": 5,
162-
"template": "templates.qa.multiple_choice.with_context.match",
165+
"template": "templates.qa.multiple_choice.with_context.bluebench",
163166
"demos_taken_from": "test",
164167
}
165168
recipe = prepare_recipe(default_args, ingridients)
@@ -178,6 +181,7 @@ def prepare_recipe(default_args, specific_args):
178181
"demos_pool_size": 10,
179182
"template": "templates.classification.multi_class.bluebench",
180183
"demos_taken_from": "test",
184+
"num_demos": 1,
181185
}
182186
recipe = prepare_recipe(default_args, ingridients)
183187
add_to_catalog(
@@ -235,10 +239,11 @@ def prepare_recipe(default_args, specific_args):
235239
"card": f"cards.universal_ner.{subset}",
236240
"demos_pool_size": 10000,
237241
"num_demos": 5,
238-
"template": "templates.span_labeling.extraction.title",
242+
"template": "templates.span_labeling.extraction.detailed",
239243
"metrics": ["metrics.ner[zero_division=1.0]"],
240244
"train_refiner": "operators.balancers.ner.zero_vs_many_entities[segments_boundaries=[0,1,2]]",
241245
"demos_taken_from": "test" if "pud" in subset else "train",
246+
"max_train_instances": 10000,
242247
}
243248
recipe = prepare_recipe(default_args, ingridients)
244249
add_to_catalog(
@@ -293,6 +298,7 @@ def prepare_recipe(default_args, specific_args):
293298
ingridients = {
294299
"card": "cards.rag.response_generation.clapnq",
295300
"template": "templates.rag.response_generation.bluebench",
301+
"num_demos": 1,
296302
}
297303
recipe = prepare_recipe(default_args, ingridients)
298304
add_to_catalog(
@@ -306,7 +312,7 @@ def prepare_recipe(default_args, specific_args):
306312

307313
ingridients = {
308314
"card": "cards.fin_qa",
309-
"num_demos": 1,
315+
"num_demos": 2,
310316
"template_card_index": 0,
311317
}
312318
recipe = prepare_recipe(default_args, ingridients)

src/unitxt/catalog/cards/attaq_500.json

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -526,12 +526,13 @@
526526
],
527527
"task": {
528528
"__type__": "task",
529-
"input_fields": [
530-
"input"
531-
],
532-
"reference_fields": [
533-
"label"
534-
],
529+
"input_fields": {
530+
"input": "str"
531+
},
532+
"reference_fields": {
533+
"label": "str"
534+
},
535+
"prediction_type": "str",
535536
"metrics": [
536537
"metrics.safety_metric"
537538
]

src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_age.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,8 @@
55
"demos_taken_from": "test",
66
"max_train_instances": 1000,
77
"max_validation_instances": 1000,
8-
"max_test_instances": 90,
8+
"max_test_instances": 9,
99
"card": "cards.safety.bbq.Age",
10-
"template": "templates.qa.multiple_choice.with_context.match",
10+
"template": "templates.qa.multiple_choice.with_context.bluebench",
1111
"format": "formats.chat_api"
1212
}

src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_disability_status.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,8 @@
55
"demos_taken_from": "test",
66
"max_train_instances": 1000,
77
"max_validation_instances": 1000,
8-
"max_test_instances": 90,
8+
"max_test_instances": 9,
99
"card": "cards.safety.bbq.Disability_status",
10-
"template": "templates.qa.multiple_choice.with_context.match",
10+
"template": "templates.qa.multiple_choice.with_context.bluebench",
1111
"format": "formats.chat_api"
1212
}

src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_gender_identity.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,8 @@
55
"demos_taken_from": "test",
66
"max_train_instances": 1000,
77
"max_validation_instances": 1000,
8-
"max_test_instances": 90,
8+
"max_test_instances": 9,
99
"card": "cards.safety.bbq.Gender_identity",
10-
"template": "templates.qa.multiple_choice.with_context.match",
10+
"template": "templates.qa.multiple_choice.with_context.bluebench",
1111
"format": "formats.chat_api"
1212
}

src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_nationality.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,8 @@
55
"demos_taken_from": "test",
66
"max_train_instances": 1000,
77
"max_validation_instances": 1000,
8-
"max_test_instances": 90,
8+
"max_test_instances": 9,
99
"card": "cards.safety.bbq.Nationality",
10-
"template": "templates.qa.multiple_choice.with_context.match",
10+
"template": "templates.qa.multiple_choice.with_context.bluebench",
1111
"format": "formats.chat_api"
1212
}

src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_physical_appearance.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,8 @@
55
"demos_taken_from": "test",
66
"max_train_instances": 1000,
77
"max_validation_instances": 1000,
8-
"max_test_instances": 90,
8+
"max_test_instances": 9,
99
"card": "cards.safety.bbq.Physical_appearance",
10-
"template": "templates.qa.multiple_choice.with_context.match",
10+
"template": "templates.qa.multiple_choice.with_context.bluebench",
1111
"format": "formats.chat_api"
1212
}

src/unitxt/catalog/recipes/bluebench/bias/safety_bbq_race_ethnicity.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,8 @@
55
"demos_taken_from": "test",
66
"max_train_instances": 1000,
77
"max_validation_instances": 1000,
8-
"max_test_instances": 90,
8+
"max_test_instances": 9,
99
"card": "cards.safety.bbq.Race_ethnicity",
10-
"template": "templates.qa.multiple_choice.with_context.match",
10+
"template": "templates.qa.multiple_choice.with_context.bluebench",
1111
"format": "formats.chat_api"
1212
}

0 commit comments

Comments
 (0)