-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathhotpotqa.py
128 lines (120 loc) · 4.64 KB
/
hotpotqa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import json
from unitxt import add_to_catalog
from unitxt.blocks import TaskCard
from unitxt.collections_operators import Explode, Wrap
from unitxt.loaders import LoadHF
from unitxt.operators import (
Copy,
Deduplicate,
Set,
ZipFieldValues,
)
from unitxt.splitters import SplitRandomMix
from unitxt.string_operators import Join, Replace
from unitxt.templates import InputOutputTemplate
from unitxt.test_utils.card import test_card
# Benchmark
# End-to-end RAG benchmark card for HotpotQA (distractor configuration):
# each instance carries the question, its gold contexts, and the gold answer.
benchmark_card = TaskCard(
    loader=LoadHF(
        path="hotpotqa/hotpot_qa",
        name="distractor",
        data_classification_policy=["public"],
    ),
    preprocess_steps=[
        # Carve a test split out of train (presumably because the official
        # HotpotQA test answers are not public -- TODO confirm).
        SplitRandomMix(
            {
                "test": "train[30%]",
                "train": "train[70%]",
            }
        ),
        Copy(
            field_to_field={
                "question": "question",
                "id": "question_id",
                "level": "metadata_tags/level",
            },
        ),
        Copy(
            field="context/title",
            to_field="reference_context_ids",
        ),
        # Each context is a list of sentences; join them into one passage string.
        Join(
            field="context/sentences",
            by=" ",
            to_field="reference_contexts",
            process_every_value=True,
        ),
        # Every retained HotpotQA question is answerable from its gold contexts.
        Set(
            fields={
                "is_answerable_label": True,
            }
        ),
        # The task schema expects a list of reference answers.
        Wrap(
            field="answer",
            inside="list",
            to_field="reference_answers",
        ),
    ],
    task="tasks.rag.end_to_end",
    templates={"default": "templates.rag.end_to_end.json_predictions"},
    # NOTE(review): the url tag points at the BeIR mirror while the loader reads
    # "hotpotqa/hotpot_qa" -- confirm which dataset page should be referenced.
    __tags__={"license": "CC BY-SA 4.0", "url": "https://huggingface.co/datasets/BeIR/hotpotqa"},
    __description__="""HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering.
HotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems ability to extract relevant facts and perform necessary comparison.
""",
)
# A deliberately wrong prediction, fed to test_card below to confirm that a
# fully mismatched answer scores poorly under the end-to-end RAG metrics.
wrong_answer = dict(
    contexts=["hi"],
    is_answerable=True,
    answer="Don't know",
    context_ids=["id0"],
)
# Sanity-check the benchmark card end to end (including the fully mismatched
# prediction above) before publishing it to the catalog.
full_mismatch_predictions = [json.dumps(wrong_answer)]
test_card(
    benchmark_card,
    strict=True,
    full_mismatch_prediction_values=full_mismatch_predictions,
    debug=False,
    demos_taken_from="test",
    demos_pool_size=5,
)
add_to_catalog(benchmark_card, "cards.rag.benchmark.hotpotqa.en", overwrite=True)
# Documents
# Corpus card for HotpotQA: flattens every (title, sentences) context pair
# into a standalone document record, deduplicated by title across examples.
documents_card = TaskCard(
    loader=LoadHF(
        path="hotpotqa/hotpot_qa",
        name="distractor",
        data_classification_policy=["public"],
    ),
    preprocess_steps=[
        # Each context is a list of sentences; join them into one passage string.
        Join(
            field="context/sentences",
            by=" ",
            to_field="context_sentences",
            process_every_value=True,
        ),
        # Pair every title with its joined passage: [[title, text], ...].
        ZipFieldValues(
            fields=["context/title", "context_sentences"],
            to_field="documents",
        ),
        # Emit one instance per (title, text) pair.
        Explode(
            field="documents",
            to_field="document",
        ),
        Copy(field="document/0", to_field="document_id"),  # the title doubles as the id
        Copy(field="document/0", to_field="title"),
        # Normalize non-breaking spaces in the passage text.
        Replace(field="document/1", old="\xa0", new=" "),
        Wrap(field="document/1", inside="list", to_field="passages"),
        # Titles repeat across questions; keep a single copy of each document.
        Deduplicate(by=["document_id"]),
    ],
    task="tasks.rag.corpora",
    templates={
        # Corpora are indexed, not generated from, so the template is empty.
        "empty": InputOutputTemplate(
            input_format="",
            output_format="",
        ),
    },
    # NOTE(review): the url tag points at the BeIR mirror while the loader reads
    # "hotpotqa/hotpot_qa" -- confirm which dataset page should be referenced.
    __tags__={"license": "CC BY-SA 4.0", "url": "https://huggingface.co/datasets/BeIR/hotpotqa"},
    __description__="""HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering.
HotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems ability to extract relevant facts and perform necessary comparison.
""",
)
# Not testing card, because documents are not evaluated.
add_to_catalog(documents_card, "cards.rag.documents.hotpotqa.en", overwrite=True)