Commit e4bdf48

add tags information - url (#1691)

Authored by BenjSz and elronbandel
Co-authored-by: Elron Bandel <[email protected]>
1 parent cdf5b82

File tree: 12 files changed, +32 −12 lines


prepare/cards/rag/end_to_end/bioasq.py (+2 −2)

@@ -36,7 +36,7 @@
     ],
     task="tasks.rag.end_to_end",
     templates={"default": "templates.rag.end_to_end.json_predictions"},
-    __tags__={"license": "cc-by-2.5"},
+    __tags__={"license": "cc-by-2.5", "url": "https://huggingface.co/datasets/enelpol/rag-mini-bioasq"},
     __description__="""This dataset is a subset of a training dataset by the BioASQ Challenge, which is available here.

It is derived from rag-datasets/rag-mini-bioasq.
@@ -88,7 +88,7 @@
             output_format="",
         ),
     },
-    __tags__={"license": "cc-by-2.5"},
+    __tags__={"license": "cc-by-2.5", "url": "https://huggingface.co/datasets/enelpol/rag-mini-bioasq"},
     __description__="""This dataset is a subset of a training dataset by the BioASQ Challenge, which is available here.

It is derived from rag-datasets/rag-mini-bioasq.
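All twelve changes in this commit follow the same pattern: a card's __tags__ dict gains a "url" key pointing at the source dataset on Hugging Face. For context, a minimal sketch of such a card definition; the loader arguments and catalog name here are illustrative assumptions, not taken from this diff.

# Minimal sketch of a unitxt card carrying license and url metadata in
# __tags__. Loader arguments and catalog name are assumptions for
# illustration, not the bioasq card's actual configuration.
from unitxt.card import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF

card = TaskCard(
    loader=LoadHF(path="enelpol/rag-mini-bioasq", name="question-answer-passages"),
    preprocess_steps=[],
    task="tasks.rag.end_to_end",
    templates={"default": "templates.rag.end_to_end.json_predictions"},
    __tags__={
        "license": "cc-by-2.5",
        "url": "https://huggingface.co/datasets/enelpol/rag-mini-bioasq",
    },
    __description__="This dataset is a subset of a training dataset by the BioASQ Challenge.",
)

add_to_catalog(card, "cards.rag.benchmark.bioasq.en", overwrite=True)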

prepare/cards/rag/end_to_end/clapnq.py (+4)

@@ -42,6 +42,8 @@ class ClapNqBenchmark:
         ),
     ],
     task="tasks.rag.end_to_end",
+    __tags__={"license": "Apache License 2.0", "url": "https://huggingface.co/datasets/PrimeQA/clapnq"},
+    __description__="""CLAP NQ is created from the subset of Natural Questions (NQ) that have a long answer but no short answer. NQ consists of ~380k examples. There are ~30k questions that are long answers without short answers excluding tables and lists. To increases the likelihood of longer answers we only explored ones that have more than 5 sentences in the passage. The subset that was annotated consists of ~12k examples. All examples where cohesion of non-consecutive sentences was required for the answer were annotated a second time. The final dataset is made up of all data that went through two rounds of annotation. (We provide the single round annotations as well - it is only training data) An equal amount of unanswerable questions have also been added from the original NQ train/dev sets. Details about the annotation task and unanswerables can be found at https://github.com/primeqa/clapnq/blob/main/annotated_data.""",
     # templates=["templates.empty"],
     templates={"default": "templates.rag.end_to_end.json_predictions"},
 )
@@ -87,6 +89,8 @@ class ClapNqBenchmark:
         ),
     ],
     task="tasks.rag.corpora",
+    __tags__={"license": "Apache License 2.0", "url": "https://huggingface.co/datasets/PrimeQA/clapnq"},
+    __description__="""CLAP NQ is created from the subset of Natural Questions (NQ) that have a long answer but no short answer. NQ consists of ~380k examples. There are ~30k questions that are long answers without short answers excluding tables and lists. To increases the likelihood of longer answers we only explored ones that have more than 5 sentences in the passage. The subset that was annotated consists of ~12k examples. All examples where cohesion of non-consecutive sentences was required for the answer were annotated a second time. The final dataset is made up of all data that went through two rounds of annotation. (We provide the single round annotations as well - it is only training data) An equal amount of unanswerable questions have also been added from the original NQ train/dev sets. Details about the annotation task and unanswerables can be found at https://github.com/primeqa/clapnq/blob/main/annotated_data.""",
     templates={
         "empty": InputOutputTemplate(
             input_format="",

prepare/cards/rag/end_to_end/hotpotqa.py (+2 −2)

@@ -58,7 +58,7 @@
     ],
     task="tasks.rag.end_to_end",
     templates={"default": "templates.rag.end_to_end.json_predictions"},
-    __tags__={"license": "CC BY-SA 4.0"},
+    __tags__={"license": "CC BY-SA 4.0", "url": "https://huggingface.co/datasets/BeIR/hotpotqa"},
     __description__="""HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering.
HotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems ability to extract relevant facts and perform necessary comparison.
""",
@@ -118,7 +118,7 @@
             output_format="",
         ),
     },
-    __tags__={"license": "CC BY-SA 4.0"},
+    __tags__={"license": "CC BY-SA 4.0", "url": "https://huggingface.co/datasets/BeIR/hotpotqa"},
     __description__="""HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering.
HotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems ability to extract relevant facts and perform necessary comparison.
""",

prepare/cards/rag/end_to_end/miniwikipedia.py (+2 −2)

@@ -27,7 +27,7 @@
     ],
     task="tasks.rag.end_to_end",
     templates={"default": "templates.rag.end_to_end.json_predictions"},
-    __tags__={"license": "cc-by-sa-3.0"},
+    __tags__={"license": "cc-by-2.5", "url": "https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia/"},
     __description__="""This dataset, a subset generated by the RAG-Datasets team, supports research in question answering by providing questions and answers derived from Wikipedia articles, along with difficulty ratings assigned by both question writers and answerers. It includes files for questions from three student cohorts (S08, S09, and S10) and 690,000 words of cleaned Wikipedia text, facilitating exploration of question generation and answering tasks.""",
 )

@@ -72,7 +72,7 @@
             output_format="",
         ),
     },
-    __tags__={"license": "cc-by-2.5"},
+    __tags__={"license": "cc-by-2.5", "url": "https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia/"},
     __description__="""This dataset, a subset generated by the RAG-Datasets team, supports research in question answering by providing questions and answers derived from Wikipedia articles, along with difficulty ratings assigned by both question writers and answerers. It includes files for questions from three student cohorts (S08, S09, and S10) and 690,000 words of cleaned Wikipedia text, facilitating exploration of question generation and answering tasks.""",
 )

src/unitxt/catalog/cards/rag/benchmark/bioasq/en.json (+2 −1)

@@ -35,7 +35,8 @@
         "default": "templates.rag.end_to_end.json_predictions"
     },
     "__tags__": {
-        "license": "cc-by-2.5"
+        "license": "cc-by-2.5",
+        "url": "https://huggingface.co/datasets/enelpol/rag-mini-bioasq"
     },
     "__description__": "This dataset is a subset of a training dataset by the BioASQ Challenge, which is available here.\n\nIt is derived from rag-datasets/rag-mini-bioasq.\n\nModifications include:\n\nfilling in missing passages (some of them contained \"nan\" instead of actual text),\nchanging relevant_passage_ids' type from string to sequence of ints,\ndeduplicating the passages (removed 40 duplicates) and fixing the relevant_passage_ids in QAP triplets to point to the corrected, deduplicated passages' ids,\nsplitting QAP triplets into train and test splits.\n"
 }
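Each prepare script above has a mirrored JSON entry under src/unitxt/catalog, which is why every tag change in this commit appears twice. Once saved, the new "url" tag can be read back from a catalog entry with nothing beyond the standard library; a small sketch, assuming the path is relative to the repository root:

# Read the license and url tags back out of a saved catalog entry.
import json
from pathlib import Path

entry = Path("src/unitxt/catalog/cards/rag/benchmark/bioasq/en.json")
card = json.loads(entry.read_text())

tags = card.get("__tags__", {})
print(tags.get("license"))  # "cc-by-2.5"
print(tags.get("url"))      # "https://huggingface.co/datasets/enelpol/rag-mini-bioasq"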

src/unitxt/catalog/cards/rag/benchmark/clap_nq/en.json (+5)

@@ -35,6 +35,11 @@
         }
     ],
     "task": "tasks.rag.end_to_end",
+    "__tags__": {
+        "license": "Apache License 2.0",
+        "url": "https://huggingface.co/datasets/PrimeQA/clapnq"
+    },
+    "__description__": "CLAP NQ is created from the subset of Natural Questions (NQ) that have a long answer but no short answer. NQ consists of ~380k examples. There are ~30k questions that are long answers without short answers excluding tables and lists. To increases the likelihood of longer answers we only explored ones that have more than 5 sentences in the passage. The subset that was annotated consists of ~12k examples. All examples where cohesion of non-consecutive sentences was required for the answer were annotated a second time. The final dataset is made up of all data that went through two rounds of annotation. (We provide the single round annotations as well - it is only training data) An equal amount of unanswerable questions have also been added from the original NQ train/dev sets. Details about the annotation task and unanswerables can be found at https://github.com/primeqa/clapnq/blob/main/annotated_data.",
     "templates": {
         "default": "templates.rag.end_to_end.json_predictions"
     }

src/unitxt/catalog/cards/rag/benchmark/hotpotqa/en.json (+2 −1)

@@ -54,7 +54,8 @@
         "default": "templates.rag.end_to_end.json_predictions"
     },
     "__tags__": {
-        "license": "CC BY-SA 4.0"
+        "license": "CC BY-SA 4.0",
+        "url": "https://huggingface.co/datasets/BeIR/hotpotqa"
     },
     "__description__": "HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering.\nHotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems ability to extract relevant facts and perform necessary comparison.\n "
 }

src/unitxt/catalog/cards/rag/benchmark/miniwiki/en.json (+2 −1)

@@ -33,7 +33,8 @@
         "default": "templates.rag.end_to_end.json_predictions"
     },
     "__tags__": {
-        "license": "cc-by-sa-3.0"
+        "license": "cc-by-2.5",
+        "url": "https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia/"
     },
     "__description__": "This dataset, a subset generated by the RAG-Datasets team, supports research in question answering by providing questions and answers derived from Wikipedia articles, along with difficulty ratings assigned by both question writers and answerers. It includes files for questions from three student cohorts (S08, S09, and S10) and 690,000 words of cleaned Wikipedia text, facilitating exploration of question generation and answering tasks."
 }

src/unitxt/catalog/cards/rag/documents/bioasq/en.json (+2 −1)

@@ -41,7 +41,8 @@
         }
     },
     "__tags__": {
-        "license": "cc-by-2.5"
+        "license": "cc-by-2.5",
+        "url": "https://huggingface.co/datasets/enelpol/rag-mini-bioasq"
     },
     "__description__": "This dataset is a subset of a training dataset by the BioASQ Challenge, which is available here.\n\nIt is derived from rag-datasets/rag-mini-bioasq.\n\nModifications include:\n\nfilling in missing passages (some of them contained \"nan\" instead of actual text),\nchanging relevant_passage_ids' type from string to sequence of ints,\ndeduplicating the passages (removed 40 duplicates) and fixing the relevant_passage_ids in QAP triplets to point to the corrected, deduplicated passages' ids,\nsplitting QAP triplets into train and test splits.\n"
 }

src/unitxt/catalog/cards/rag/documents/clap_nq/en.json (+5)

@@ -30,6 +30,11 @@
         }
     ],
     "task": "tasks.rag.corpora",
+    "__tags__": {
+        "license": "Apache License 2.0",
+        "url": "https://huggingface.co/datasets/PrimeQA/clapnq"
+    },
+    "__description__": "CLAP NQ is created from the subset of Natural Questions (NQ) that have a long answer but no short answer. NQ consists of ~380k examples. There are ~30k questions that are long answers without short answers excluding tables and lists. To increases the likelihood of longer answers we only explored ones that have more than 5 sentences in the passage. The subset that was annotated consists of ~12k examples. All examples where cohesion of non-consecutive sentences was required for the answer were annotated a second time. The final dataset is made up of all data that went through two rounds of annotation. (We provide the single round annotations as well - it is only training data) An equal amount of unanswerable questions have also been added from the original NQ train/dev sets. Details about the annotation task and unanswerables can be found at https://github.com/primeqa/clapnq/blob/main/annotated_data.",
     "templates": {
         "empty": {
             "__type__": "input_output_template",

src/unitxt/catalog/cards/rag/documents/hotpotqa/en.json (+2 −1)

@@ -67,7 +67,8 @@
         }
     },
     "__tags__": {
-        "license": "CC BY-SA 4.0"
+        "license": "CC BY-SA 4.0",
+        "url": "https://huggingface.co/datasets/BeIR/hotpotqa"
     },
     "__description__": "HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering.\nHotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems ability to extract relevant facts and perform necessary comparison.\n"
 }

src/unitxt/catalog/cards/rag/documents/miniwiki/en.json (+2 −1)

@@ -37,7 +37,8 @@
         }
     },
     "__tags__": {
-        "license": "cc-by-2.5"
+        "license": "cc-by-2.5",
+        "url": "https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia/"
     },
     "__description__": "This dataset, a subset generated by the RAG-Datasets team, supports research in question answering by providing questions and answers derived from Wikipedia articles, along with difficulty ratings assigned by both question writers and answerers. It includes files for questions from three student cohorts (S08, S09, and S10) and 690,000 words of cleaned Wikipedia text, facilitating exploration of question generation and answering tasks."
 }
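Since the same tag shape now appears across all twelve files, a hypothetical consistency check (not part of this commit) could walk the RAG card catalog and flag any entry still missing the "url" key:

# Hypothetical check: every RAG card's __tags__ block should carry a "url".
import json
from pathlib import Path

catalog_root = Path("src/unitxt/catalog/cards/rag")
for entry in sorted(catalog_root.rglob("*.json")):
    data = json.loads(entry.read_text())
    tags = data.get("__tags__", {})
    if "url" not in tags:
        print(f"missing url tag: {entry}")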
