# bioasq.py
# Builds and registers the unitxt catalog cards for the
# enelpol/rag-mini-bioasq dataset: the end-to-end RAG benchmark card
# ("cards.rag.benchmark.bioasq.en") and the document corpus card
# ("cards.rag.documents.bioasq.en").
import json
from unitxt import add_to_catalog
from unitxt.blocks import TaskCard
from unitxt.collections_operators import Wrap
from unitxt.loaders import LoadHF
from unitxt.operators import Cast, Copy
from unitxt.splitters import RenameSplits
from unitxt.templates import InputOutputTemplate
from unitxt.test_utils.card import test_card
# Benchmark card: question/answer records for the end-to-end RAG task.
_qa_loader = LoadHF(
    path="enelpol/rag-mini-bioasq",
    name="question-answer-passages",
    data_classification_policy=["public"],
)

_qa_preprocess_steps = [
    # Copy "id" into "question_id"; "question" is mapped onto itself.
    Copy(field_to_field={"question": "question", "id": "question_id"}),
    # Store a string-cast version of "relevant_passage_ids" under
    # "reference_context_ids" (process_every_value=True: presumably each
    # id in the sequence is cast individually -- confirm against Cast).
    Cast(
        field="relevant_passage_ids",
        to="str",
        to_field="reference_context_ids",
        process_every_value=True,
    ),
    # Wrap the "answer" field in a list and store it as "reference_answers".
    Wrap(field="answer", inside="list", to_field="reference_answers"),
]

card = TaskCard(
    loader=_qa_loader,
    preprocess_steps=_qa_preprocess_steps,
    task="tasks.rag.end_to_end",
    templates={"default": "templates.rag.end_to_end.json_predictions"},
    __tags__={
        "license": "cc-by-2.5",
        "url": "https://huggingface.co/datasets/enelpol/rag-mini-bioasq",
    },
    # The description text is kept flush-left on purpose: any leading
    # whitespace would become part of the string.
    __description__="""This dataset is a subset of a training dataset by the BioASQ Challenge, which is available here.
It is derived from rag-datasets/rag-mini-bioasq.
Modifications include:
filling in missing passages (some of them contained "nan" instead of actual text),
changing relevant_passage_ids' type from string to sequence of ints,
deduplicating the passages (removed 40 duplicates) and fixing the relevant_passage_ids in QAP triplets to point to the corrected, deduplicated passages' ids,
splitting QAP triplets into train and test splits.
""",
)
# Prediction payload that is JSON-serialised and handed to test_card as the
# "full mismatch" prediction. Key order is kept as-is because it determines
# the serialised string.
wrong_answer = {
    "contexts": ["hi"],
    "is_answerable": True,
    "answer": "Don't know",
    "context_ids": ["id0"],
}
# Check the benchmark card before publishing it. The serialised
# wrong_answer is supplied as the prediction for the full-mismatch check
# (presumably expected to score as a complete miss -- confirm against
# unitxt's test_card).
_mismatch_prediction = json.dumps(wrong_answer)
test_card(
    card,
    full_mismatch_prediction_values=[_mismatch_prediction],
    demos_taken_from="test",
    demos_pool_size=5,
    strict=True,
    debug=False,
)

# Register the benchmark card, replacing any existing catalog entry.
add_to_catalog(card, "cards.rag.benchmark.bioasq.en", overwrite=True)
# Documents
# Corpus card: the passages of the same dataset, for the RAG corpora task.
_corpus_loader = LoadHF(
    path="enelpol/rag-mini-bioasq",
    name="text-corpus",
    data_classification_policy=["public"],
)

_corpus_preprocess_steps = [
    # Split-name mapping "test" -> "train".
    RenameSplits({"test": "train"}),
    # Cast "id" to str in place, then copy it into "document_id".
    Cast(field="id", to="str"),
    Copy(field="id", to_field="document_id"),
    # Wrap the "passage" field in a list and store it as "passages".
    Wrap(field="passage", inside="list", to_field="passages"),
]

# Template with empty input and output formats.
_empty_template = InputOutputTemplate(input_format="", output_format="")

card = TaskCard(
    loader=_corpus_loader,
    preprocess_steps=_corpus_preprocess_steps,
    task="tasks.rag.corpora",
    templates={"empty": _empty_template},
    __tags__={
        "license": "cc-by-2.5",
        "url": "https://huggingface.co/datasets/enelpol/rag-mini-bioasq",
    },
    # The description text is kept flush-left on purpose: any leading
    # whitespace would become part of the string.
    __description__="""This dataset is a subset of a training dataset by the BioASQ Challenge, which is available here.
It is derived from rag-datasets/rag-mini-bioasq.
Modifications include:
filling in missing passages (some of them contained "nan" instead of actual text),
changing relevant_passage_ids' type from string to sequence of ints,
deduplicating the passages (removed 40 duplicates) and fixing the relevant_passage_ids in QAP triplets to point to the corrected, deduplicated passages' ids,
splitting QAP triplets into train and test splits.
""",
)

# Not testing card, because documents are not evaluated.
add_to_catalog(card, "cards.rag.documents.bioasq.en", overwrite=True)