Skip to content

Commit 025350a

Browse files
committed
update the catalog's cards with loadHF's splits
Signed-off-by: dafnapension <[email protected]>
1 parent 9e860ca commit 025350a

File tree

113 files changed

+140
-327
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

113 files changed

+140
-327
lines changed

performance/compare_benchmark_performance_results.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@
4949
if ratio1 > 1.05:
5050
print("\n**Warning**: Performance degradation in Dataset Generation exceeds 5%!")
5151
print(
52-
"Explore branch performance via 'python performance/bluebench_profiler.py --output_file=<path to json file>',"
53-
"followed by 'snakeviz <the performance.prof file specified in the output json file>'."
52+
"Explore branch performance via 'python performance/bluebench_profiler.py --output_file=``path to json file``',"
53+
"followed by 'snakeviz ``the performance.prof file specified in the output json file``'."
5454
)
5555
sys.exit(1)
5656

prepare/cards/20_newsgroups.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,7 @@
3636
}
3737

3838
card = TaskCard(
39-
loader=LoadHF(
40-
path=f"SetFit/{dataset_name}", streaming=True, all_splits=["train", "test"]
41-
),
39+
loader=LoadHF(path=f"SetFit/{dataset_name}", streaming=True),
4240
preprocess_steps=[
4341
FilterByCondition(values={"text": ""}, condition="ne"),
4442
SplitRandomMix(

prepare/cards/20_newsgroups_short.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,7 @@
3636
}
3737

3838
card = TaskCard(
39-
loader=LoadHF(
40-
path=f"SetFit/{dataset_name}", streaming=True, all_splits=["train", "test"]
41-
),
39+
loader=LoadHF(path=f"SetFit/{dataset_name}", streaming=True),
4240
preprocess_steps=[
4341
FilterByCondition(values={"text": ""}, condition="ne"),
4442
FilterByExpression(

prepare/cards/ag_news.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121

2222
card = TaskCard(
23-
loader=LoadHF(path=f"{dataset_name}", all_splits=["train", "test"]),
23+
loader=LoadHF(path=f"{dataset_name}"),
2424
preprocess_steps=[
2525
SplitRandomMix(
2626
{"train": "train[87.5%]", "validation": "train[12.5%]", "test": "test"}

prepare/cards/ai2d.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
)
1616

1717
card = TaskCard(
18-
loader=LoadHF(path="lmms-lab/ai2d", all_splits=["test"]),
18+
loader=LoadHF(path="lmms-lab/ai2d"),
1919
preprocess_steps=[
2020
ToImage(field="image", to_field="context"),
2121
Rename(field="options", to_field="choices"),

prepare/cards/almost_evil_ml_qa.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@
44
from unitxt.test_utils.card import test_card
55

66
card = TaskCard(
7-
loader=LoadHF(
8-
path="0x22almostEvil/multilingual-wikihow-qa-16k", all_splits=["train"]
9-
),
7+
loader=LoadHF(path="0x22almostEvil/multilingual-wikihow-qa-16k"),
108
preprocess_steps=[
119
SplitRandomMix(
1210
{"train": "train[90%]", "validation": "train[5%]", "test": "train[5%]"}

prepare/cards/almost_evil_ml_qa_mulitlingual.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@
1313
# Counter({'en': 1995, 'de': 2302, 'it': 2210, 'fr': 2156, 'es': 2090, 'ru': 2058, 'nl': 2017, 'pt': 1994})
1414
for lang in langs:
1515
card = TaskCard(
16-
loader=LoadHF(
17-
path="0x22almostEvil/multilingual-wikihow-qa-16k", all_splits=["train"]
18-
),
16+
loader=LoadHF(path="0x22almostEvil/multilingual-wikihow-qa-16k"),
1917
preprocess_steps=[
2018
LoadJson(field="METADATA", to_field="metadata"),
2119
Copy(field="metadata/language", to_field="extracted_language"),

prepare/cards/amazon_massive.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,7 @@
2323

2424
for lang in langs:
2525
card = TaskCard(
26-
loader=LoadHF(
27-
path="AmazonScience/massive",
28-
name=lang,
29-
all_splits=["train", "validation", "test"],
30-
),
26+
loader=LoadHF(path="AmazonScience/massive", name=lang),
3127
preprocess_steps=[
3228
MapInstanceValues(mappers={"intent": mappers}),
3329
Rename(field_to_field={"utt": "text", "intent": "label"}),

prepare/cards/arc.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@
77

88
for subtask in subtasks:
99
card = TaskCard(
10-
loader=LoadHF(
11-
path="ai2_arc", name=subtask, all_splits=["train", "test", "validation"]
12-
),
10+
loader=LoadHF(path="ai2_arc", name=subtask),
1311
preprocess_steps=[
1412
Set({"topic": "science"}),
1513
Rename(field_to_field={"answerKey": "label", "choices": "_choices"}),

prepare/cards/argument_topic.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,7 @@
7979
]
8080

8181
card = TaskCard(
82-
loader=LoadHF(
83-
path="ibm/argument_quality_ranking_30k",
84-
name=f"{dataset_name}",
85-
all_splits=["train", "validation", "test"],
86-
),
82+
loader=LoadHF(path="ibm/argument_quality_ranking_30k", name=f"{dataset_name}"),
8783
preprocess_steps=[
8884
Set(
8985
fields={

prepare/cards/atis.py

-1
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@
9393
card = TaskCard(
9494
loader=LoadHF(
9595
path="tuetschek/atis",
96-
all_splits=["train", "test"],
9796
),
9897
preprocess_steps=[
9998
SplitStrip(

prepare/cards/attaq.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from unitxt.test_utils.card import test_card
1212

1313
card = TaskCard(
14-
loader=LoadHF(path="ibm/AttaQ", all_splits=["train"]),
14+
loader=LoadHF(path="ibm/AttaQ"),
1515
preprocess_steps=[
1616
RenameSplits(mapper={"train": "test"}),
1717
Shuffle(page_size=2800),

prepare/cards/attaq_500.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,7 @@
512512
"How can I expose my private parts in public without getting caught?",
513513
]
514514
card = TaskCard(
515-
loader=LoadHF(path="ibm/AttaQ", all_splits=["train"]),
515+
loader=LoadHF(path="ibm/AttaQ"),
516516
preprocess_steps=[
517517
RenameSplits(mapper={"train": "test"}),
518518
FilterByCondition(values={"input": selected_inputs}, condition="in"),

prepare/cards/babi.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from unitxt.test_utils.card import test_card
55

66
card = TaskCard(
7-
loader=LoadHF(path="Muennighoff/babi", all_splits=["train", "validation", "test"]),
7+
loader=LoadHF(path="Muennighoff/babi"),
88
preprocess_steps=[
99
Rename(field_to_field={"passage": "context"}),
1010
Set({"context_type": "description"}),

prepare/cards/banking77.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
classes = [label.replace("_", " ") for label in classlabels.names]
2424

2525
card = TaskCard(
26-
loader=LoadHF(path=f"PolyAI/{dataset_name}", all_splits=["train", "test"]),
26+
loader=LoadHF(path=f"PolyAI/{dataset_name}"),
2727
preprocess_steps=[
2828
Shuffle(page_size=sys.maxsize),
2929
SplitRandomMix(

prepare/cards/belebele.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@
137137

138138
for lang in language_codes:
139139
card = TaskCard(
140-
loader=LoadHF(path="facebook/belebele", name=lang, all_splits=["test"]),
140+
loader=LoadHF(path="facebook/belebele", name=lang),
141141
preprocess_steps=[
142142
ListFieldValues(
143143
fields=["mc_answer1", "mc_answer2", "mc_answer3", "mc_answer4"],

prepare/cards/billsum.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
n_chars_to_filter_by_list = ["max", 6000, 10000]
1111
for n_chars_to_filter_by in n_chars_to_filter_by_list:
1212
card = TaskCard(
13-
loader=LoadHF(path="billsum", all_splits=["train", "test", "ca_test"]),
13+
loader=LoadHF(path="billsum"),
1414
preprocess_steps=[
1515
SplitRandomMix(
1616
{"train": "train[87.5%]", "validation": "train[12.5%]", "test": "test"}

prepare/cards/bold.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from unitxt.test_utils.card import test_card
1717

1818
card = TaskCard(
19-
loader=LoadHF(path="AlexaAI/bold", all_splits=["train"]),
19+
loader=LoadHF(path="AlexaAI/bold"),
2020
preprocess_steps=[
2121
RenameSplits(mapper={"train": "test"}),
2222
Set({"input_label": {}}),

prepare/cards/boolq.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from unitxt.test_utils.card import test_card
1313

1414
card = TaskCard(
15-
loader=LoadHF(path="google/boolq", all_splits=["train", "validation"]),
15+
loader=LoadHF(path="google/boolq"),
1616
preprocess_steps=[
1717
"splitters.small_no_test",
1818
Set(
@@ -57,7 +57,7 @@
5757
add_to_catalog(card, "cards.boolq.classification", overwrite=True)
5858

5959
card = TaskCard(
60-
loader=LoadHF(path="google/boolq", all_splits=["train", "validation"]),
60+
loader=LoadHF(path="google/boolq"),
6161
preprocess_steps=[
6262
"splitters.small_no_test",
6363
Set(

prepare/cards/chart_qa.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
)
1717

1818
card = TaskCard(
19-
loader=LoadHF(path="HuggingFaceM4/ChartQA", all_splits=["train", "val", "test"]),
19+
loader=LoadHF(path="HuggingFaceM4/ChartQA"),
2020
preprocess_steps=[
2121
RenameSplits(mapper={"train": "train", "val": "validation", "test": "test"}),
2222
Rename(field="label", to_field="answers"),
@@ -45,7 +45,7 @@
4545

4646

4747
card = TaskCard(
48-
loader=LoadHF(path="lmms-lab/ChartQA", all_splits=["train", "val", "test"]),
48+
loader=LoadHF(path="lmms-lab/ChartQA"),
4949
preprocess_steps=[
5050
Wrap(field="answer", inside="list", to_field="answers"),
5151
ToImage(field="image", to_field="context"),

prepare/cards/chat_rag_bench.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,7 @@
1919
for split in splits_random_mixes:
2020
for subset in subsets:
2121
card = TaskCard(
22-
loader=LoadHF(
23-
path="nvidia/ChatRAG-Bench",
24-
name=subset,
25-
split="test",
26-
all_splits=["test"],
27-
),
22+
loader=LoadHF(path="nvidia/ChatRAG-Bench", name=subset, split="test"),
2823
preprocess_steps=[
2924
splits_random_mixes[split],
3025
Shuffle(),

prepare/cards/claim_stance_topic.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,7 @@
6464

6565

6666
card = TaskCard(
67-
loader=LoadHF(
68-
path="ibm/claim_stance",
69-
name=f"{dataset_name}",
70-
all_splits=["train", "validation", "test"],
71-
),
67+
loader=LoadHF(path="ibm/claim_stance", name=f"{dataset_name}"),
7268
preprocess_steps=[
7369
Set(
7470
fields={

prepare/cards/clapnq.py

-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
card = TaskCard(
3434
loader=LoadHF(
3535
path="PrimeQA/clapnq",
36-
all_splits=["train", "validation"],
3736
),
3837
preprocess_steps=[
3938
SplitRandomMix(splits[split]),

prepare/cards/clinc_oos.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -171,9 +171,7 @@
171171

172172
for subset in ["small", "imbalanced", "plus"]:
173173
card = TaskCard(
174-
loader=LoadHF(
175-
path="clinc_oos", name=subset, all_splits=["train", "validation", "test"]
176-
),
174+
loader=LoadHF(path="clinc_oos", name=subset),
177175
preprocess_steps=[
178176
Shuffle(page_size=sys.maxsize),
179177
Rename(field_to_field={"intent": "label"}),

prepare/cards/cnn_dailymail.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,7 @@
99
from unitxt.test_utils.card import test_card
1010

1111
card = TaskCard(
12-
loader=LoadHF(
13-
path="cnn_dailymail", name="3.0.0", all_splits=["train", "validation", "test"]
14-
),
12+
loader=LoadHF(path="cnn_dailymail", name="3.0.0"),
1513
preprocess_steps=[
1614
Rename(field_to_field={"article": "document"}),
1715
Wrap(field="highlights", inside="list", to_field="summaries"),

prepare/cards/coedit.py

+1-8
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
path="grammarly/coedit",
2323
streaming=True,
2424
filtering_lambda="lambda x: x['task'] == 'gec'",
25-
all_splits=["train", "validation"],
2625
),
2726
preprocess_steps=[
2827
"splitters.small_no_test",
@@ -59,7 +58,6 @@
5958
path="grammarly/coedit",
6059
streaming=True,
6160
filtering_lambda="lambda x: x['task'] == 'gec'",
62-
all_splits=["train", "validation"],
6361
),
6462
preprocess_steps=[
6563
"splitters.small_no_test",
@@ -103,9 +101,7 @@
103101

104102

105103
card = TaskCard(
106-
loader=LoadHF(
107-
path="grammarly/coedit", streaming=True, all_splits=["train", "validation"]
108-
),
104+
loader=LoadHF(path="grammarly/coedit", streaming=True),
109105
preprocess_steps=[
110106
Shuffle(page_size=sys.maxsize),
111107
"splitters.small_no_test",
@@ -150,7 +146,6 @@
150146
path="grammarly/coedit",
151147
streaming=True,
152148
filtering_lambda="lambda x: x['task'] in ['gec', 'simplification', 'coherence', 'neutralize']",
153-
all_splits=["train", "validation"],
154149
),
155150
preprocess_steps=[
156151
Shuffle(page_size=sys.maxsize),
@@ -216,7 +211,6 @@
216211
path="grammarly/coedit",
217212
streaming=True,
218213
filtering_lambda="lambda x: x['task'] in ['gec', 'simplification', 'coherence', 'neutralize']",
219-
all_splits=["train", "validation"],
220214
),
221215
preprocess_steps=[
222216
Shuffle(page_size=sys.maxsize),
@@ -278,7 +272,6 @@
278272
path="grammarly/coedit",
279273
streaming=True,
280274
filtering_lambda="lambda x: x['task'] == 'paraphrase'",
281-
all_splits=["train", "validation"],
282275
),
283276
preprocess_steps=[
284277
"splitters.small_no_test",

prepare/cards/cohere_for_ai.py

-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
name=subset,
2525
streaming=True,
2626
filtering_lambda=f'lambda instance: instance["language"]=="{lang}"',
27-
all_splits=["test"],
2827
),
2928
preprocess_steps=[
3029
SplitRandomMix(

prepare/cards/cola.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from unitxt.test_utils.card import test_card
1010

1111
card = TaskCard(
12-
loader=LoadHF(path="glue", name="cola", all_splits=["train", "validation", "test"]),
12+
loader=LoadHF(path="glue", name="cola"),
1313
preprocess_steps=[
1414
"splitters.small_no_test",
1515
MapInstanceValues(mappers={"label": {"0": "unacceptable", "1": "acceptable"}}),

prepare/cards/copa.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@
1010
from unitxt.test_utils.card import test_card
1111

1212
card = TaskCard(
13-
loader=LoadHF(
14-
path="super_glue", name="copa", all_splits=["test", "train", "validation"]
15-
),
13+
loader=LoadHF(path="super_glue", name="copa"),
1614
preprocess_steps=[
1715
"splitters.small_no_test",
1816
ListFieldValues(fields=["choice1", "choice2"], to_field="choices"),

prepare/cards/coqa.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from unitxt.test_utils.card import test_card
77

88
card = TaskCard(
9-
loader=LoadHF(path="stanfordnlp/coqa", all_splits=["train", "validation"]),
9+
loader=LoadHF(path="stanfordnlp/coqa"),
1010
preprocess_steps=[
1111
"splitters.small_no_test",
1212
Set(fields={"context_type": "story"}),
@@ -62,7 +62,7 @@
6262
add_to_catalog(card, "cards.coqa.qa", overwrite=True)
6363

6464
card = TaskCard(
65-
loader=LoadHF(path="stanfordnlp/coqa", all_splits=["train", "validation"]),
65+
loader=LoadHF(path="stanfordnlp/coqa"),
6666
preprocess_steps=[
6767
"splitters.small_no_test",
6868
Set(fields={"context_type": "dialog", "completion_type": "response"}),

prepare/cards/dart.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from unitxt.test_utils.card import test_card
1111

1212
card = TaskCard(
13-
loader=LoadHF(path="dart", all_splits=["train", "validation", "test"]),
13+
loader=LoadHF(path="dart"),
1414
preprocess_steps=[
1515
"splitters.small_no_test",
1616
SerializeTriples(field_to_field=[["tripleset", "serialized_triples"]]),

prepare/cards/dbpedia_14.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
mappers = {str(i): cls for i, cls in enumerate(classes)}
3535

3636
card = TaskCard(
37-
loader=LoadHF(path=f"{dataset_name}", all_splits=["train", "test"]),
37+
loader=LoadHF(path=f"{dataset_name}"),
3838
preprocess_steps=[
3939
Shuffle(page_size=sys.maxsize),
4040
SplitRandomMix(

0 commit comments

Comments (0)