Skip to content

Commit dc0cab5

Browse files
authored
Merge branch 'main' into fix-loading
2 parents 78fc482 + 49cd166 commit dc0cab5

File tree

2,396 files changed

+174936
-2
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

2,396 files changed

+174936
-2
lines changed

prepare/cards/global_mmlu.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
from unitxt.card import TaskCard
2+
from unitxt.catalog import add_to_catalog
3+
from unitxt.loaders import LoadHF
4+
from unitxt.operators import (
5+
Deduplicate,
6+
FilterByCondition,
7+
ListFieldValues,
8+
MapInstanceValues,
9+
Set,
10+
)
11+
from unitxt.splitters import RenameSplits
12+
from unitxt.test_utils.card import test_card
13+
14+
# ISO codes of the 42 languages covered by the Global-MMLU translations.
languages = (
    "am ar bn cs de el en es fa fil fr ha he hi id ig it ja ko ky lt "
    "mg ms ne nl ny pl pt ro ru si sn so sr sv sw te tr uk vi yo zh"
).split()
58+
# The 57 MMLU subject subsets, listed in alphabetical order; each becomes
# its own catalog entry per language.
subtasks = """
    abstract_algebra anatomy astronomy business_ethics clinical_knowledge
    college_biology college_chemistry college_computer_science
    college_mathematics college_medicine college_physics computer_security
    conceptual_physics econometrics electrical_engineering
    elementary_mathematics formal_logic global_facts high_school_biology
    high_school_chemistry high_school_computer_science
    high_school_european_history high_school_geography
    high_school_government_and_politics high_school_macroeconomics
    high_school_mathematics high_school_microeconomics high_school_physics
    high_school_psychology high_school_statistics high_school_us_history
    high_school_world_history human_aging human_sexuality international_law
    jurisprudence logical_fallacies machine_learning management marketing
    medical_genetics miscellaneous moral_disputes moral_scenarios nutrition
    philosophy prehistory professional_accounting professional_law
    professional_medicine professional_psychology public_relations
    security_studies sociology us_foreign_policy virology world_religions
""".split()
117+
118+
119+
# Shared card description, hoisted out of the 42x57 loop (it is identical for
# every card). Fix: removed a stray U+FE0F emoji variation selector that was
# left after "(CA)" — a mis-encoded character carried over from the dataset
# page's emoji markup.
GLOBAL_MMLU_DESCRIPTION = (
    "Global-MMLU is a multilingual evaluation set spanning 42 languages, combining machine translations "
    "for MMLU questions along with professional translations and crowd-sourced post-edits. The dataset "
    "includes cultural sensitivity annotations, classifying questions as Culturally Sensitive (CS) or "
    "Culturally Agnostic (CA). This initiative was led by Cohere For AI in collaboration with external "
    "contributors from industry and academia. The test spans subjects in humanities, social sciences, hard "
    "sciences, and other areas. See the full description on the dataset page: "
    "https://huggingface.co/datasets/CohereForAI/Global-MMLU"
)

# Build one catalog card per (language, subject) pair.
is_first = True
for language in languages:
    for subject in subtasks:
        card = TaskCard(
            # Each HF config holds one language; subjects are filtered below.
            loader=LoadHF(path="CohereForAI/Global-MMLU", name=language),
            preprocess_steps=[
                # Keep only the rows belonging to this card's subject.
                FilterByCondition(values={"subject": subject}, condition="eq"),
                Deduplicate(by=["question", "subject", "answer"]),
                # The dataset's few-shot "dev" split serves as the train split.
                RenameSplits({"dev": "train"}),
                # The dataset stores the gold answer as a letter; the
                # multiple-choice task expects a 0-based index into `choices`.
                MapInstanceValues(
                    mappers={
                        "answer": {
                            "A": 0,
                            "B": 1,
                            "C": 2,
                            "D": 3,
                        }
                    }
                ),
                # Collect the four option columns into a single choices list.
                ListFieldValues(
                    fields=["option_a", "option_b", "option_c", "option_d"],
                    to_field="choices",
                ),
                # Human-readable topic for the with_topic templates.
                Set({"topic": subject.replace("_", " ")}),
            ],
            task="tasks.qa.multiple_choice.with_topic",
            templates="templates.qa.multiple_choice.with_topic.all",
            __tags__={
                "annotations_creators": "expert-generated",
                "language": language,
                "language_creators": "expert-generated",
                "license": "apache-2.0",
                "multilinguality": "multilingual",
                "size_categories": "10K<n<100K",
                "source_datasets": "original",
                "task_categories": "question-answering",
                "task_ids": "multiple-choice-qa",
                "region": "global",
            },
            __description__=GLOBAL_MMLU_DESCRIPTION,
        )

        # test_card is expensive, so sanity-check only the first card; all
        # other cards share the same structure and differ only in data.
        if is_first:
            test_card(card, strict=False)
            is_first = False
        add_to_catalog(card, f"cards.global_mmlu.{language}.{subject}", overwrite=True)

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ tests = [
104104
"bs4",
105105
"tenacity==8.3.0",
106106
"accelerate",
107-
"spacy",
107+
"spacy",
108108
"func_timeout==4.3.5",
109109
"Wikipedia-API",
110110
"sqlglot",
@@ -246,7 +246,7 @@ extend-immutable-calls = ["fastapi.Depends", "fastapi.params.Depends", "fastapi.
246246
"src".msg = "Use unitxt outside src/ and relative imports inside src/ and install unitxt from source with `pip install -e '.[dev]'`."
247247

248248
[tool.codespell]
249-
ignore-words-list = 'rouge,ot,ans,nd,cann,som,tha,vie,ment,criterias,atleast'
249+
ignore-words-list = 'rouge,ot,ans,nd,cann,som,tha,vie,ment,criterias,atleast,te'
250250
check-filenames = true
251251
check-hidden = false
252252
regex = "(?<![a-z])[a-z'`]+|[A-Z][a-z'`]*|[a-z]+'[a-z]*|[a-z]+(?=[_-])|[a-z]+(?=[A-Z])|\\d+"
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
{
2+
"__type__": "task_card",
3+
"loader": {
4+
"__type__": "load_hf",
5+
"path": "CohereForAI/Global-MMLU",
6+
"name": "am"
7+
},
8+
"preprocess_steps": [
9+
{
10+
"__type__": "filter_by_condition",
11+
"values": {
12+
"subject": "abstract_algebra"
13+
},
14+
"condition": "eq"
15+
},
16+
{
17+
"__type__": "deduplicate",
18+
"by": [
19+
"question",
20+
"subject",
21+
"answer"
22+
]
23+
},
24+
{
25+
"__type__": "rename_splits",
26+
"mapper": {
27+
"dev": "train"
28+
}
29+
},
30+
{
31+
"__type__": "map_instance_values",
32+
"mappers": {
33+
"answer": {
34+
"A": 0,
35+
"B": 1,
36+
"C": 2,
37+
"D": 3
38+
}
39+
}
40+
},
41+
{
42+
"__type__": "list_field_values",
43+
"fields": [
44+
"option_a",
45+
"option_b",
46+
"option_c",
47+
"option_d"
48+
],
49+
"to_field": "choices"
50+
},
51+
{
52+
"__type__": "set",
53+
"fields": {
54+
"topic": "abstract algebra"
55+
}
56+
}
57+
],
58+
"task": "tasks.qa.multiple_choice.with_topic",
59+
"templates": "templates.qa.multiple_choice.with_topic.all",
60+
"__tags__": {
61+
"annotations_creators": "expert-generated",
62+
"language": "am",
63+
"language_creators": "expert-generated",
64+
"license": "apache-2.0",
65+
"multilinguality": "multilingual",
66+
"size_categories": "10K<n<100K",
67+
"source_datasets": "original",
68+
"task_categories": "question-answering",
69+
"task_ids": "multiple-choice-qa",
70+
"region": "global"
71+
},
72+
"__description__": "Global-MMLU is a multilingual evaluation set spanning 42 languages, combining machine translations for MMLU questions along with professional translations and crowd-sourced post-edits. The dataset includes cultural sensitivity annotations, classifying questions as Culturally Sensitive (CS) or Culturally Agnostic (CA). This initiative was led by Cohere For AI in collaboration with external contributors from industry and academia. The test spans subjects in humanities, social sciences, hard sciences, and other areas. See the full description on the dataset page: https://huggingface.co/datasets/CohereForAI/Global-MMLU"
73+
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
{
2+
"__type__": "task_card",
3+
"loader": {
4+
"__type__": "load_hf",
5+
"path": "CohereForAI/Global-MMLU",
6+
"name": "am"
7+
},
8+
"preprocess_steps": [
9+
{
10+
"__type__": "filter_by_condition",
11+
"values": {
12+
"subject": "anatomy"
13+
},
14+
"condition": "eq"
15+
},
16+
{
17+
"__type__": "deduplicate",
18+
"by": [
19+
"question",
20+
"subject",
21+
"answer"
22+
]
23+
},
24+
{
25+
"__type__": "rename_splits",
26+
"mapper": {
27+
"dev": "train"
28+
}
29+
},
30+
{
31+
"__type__": "map_instance_values",
32+
"mappers": {
33+
"answer": {
34+
"A": 0,
35+
"B": 1,
36+
"C": 2,
37+
"D": 3
38+
}
39+
}
40+
},
41+
{
42+
"__type__": "list_field_values",
43+
"fields": [
44+
"option_a",
45+
"option_b",
46+
"option_c",
47+
"option_d"
48+
],
49+
"to_field": "choices"
50+
},
51+
{
52+
"__type__": "set",
53+
"fields": {
54+
"topic": "anatomy"
55+
}
56+
}
57+
],
58+
"task": "tasks.qa.multiple_choice.with_topic",
59+
"templates": "templates.qa.multiple_choice.with_topic.all",
60+
"__tags__": {
61+
"annotations_creators": "expert-generated",
62+
"language": "am",
63+
"language_creators": "expert-generated",
64+
"license": "apache-2.0",
65+
"multilinguality": "multilingual",
66+
"size_categories": "10K<n<100K",
67+
"source_datasets": "original",
68+
"task_categories": "question-answering",
69+
"task_ids": "multiple-choice-qa",
70+
"region": "global"
71+
},
72+
"__description__": "Global-MMLU is a multilingual evaluation set spanning 42 languages, combining machine translations for MMLU questions along with professional translations and crowd-sourced post-edits. The dataset includes cultural sensitivity annotations, classifying questions as Culturally Sensitive (CS) or Culturally Agnostic (CA). This initiative was led by Cohere For AI in collaboration with external contributors from industry and academia. The test spans subjects in humanities, social sciences, hard sciences, and other areas. See the full description on the dataset page: https://huggingface.co/datasets/CohereForAI/Global-MMLU"
73+
}

0 commit comments

Comments
 (0)