Skip to content

Commit b42fb73

Browse files
squad_v2.py: include unanswerable questions in evaluation. Fixes #1184.
1 parent 33acf35 commit b42fb73

File tree

1 file changed

+44
-1
lines changed

1 file changed

+44
-1
lines changed

src/lighteval/tasks/tasks/squad_v2.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@
1616
when possible, but also determine when no answer is supported by the paragraph
1717
and abstain from answering.
1818
19+
note:
20+
This is an LLM-friendly adaptation of the original SQuAD 2.0 evaluation.
21+
The original evaluation uses extractive span selection with a confidence-based
22+
"no answer" threshold, which does not apply to generative models.
23+
Here, the model is instead instructed to generate "unanswerable" when the
24+
question cannot be answered from the context. EM and F1 metrics are computed
25+
over both answerable and unanswerable questions.
26+
1927
languages:
2028
english
2129
@@ -28,12 +36,46 @@
2836

2937
from lighteval.metrics.metrics import Metrics
3038
from lighteval.tasks.lighteval_task import LightevalTaskConfig
39+
from lighteval.tasks.requests import Doc
3140
from lighteval.tasks.templates.qa import get_qa_prompt_function
3241
from lighteval.utils.language import Language
3342

43+
# Sentinel string the model is instructed to generate when no answer is
# supported by the context (replaces the original SQuAD 2.0 span-confidence
# "no answer" threshold, which does not apply to generative models).
UNANSWERABLE = "unanswerable"
44+
45+
46+
def squad_v2_prompt(line, task_name: str | None = None):
    """Build a SQuAD 2.0 Doc covering both answerable and unanswerable questions.

    Args:
        line: Dataset row with "context", "question" and "answers" fields
            (HF squad_v2 schema; `answers["text"]` is a list of gold spans,
            empty for unanswerable questions).
        task_name: Name of the task this doc belongs to.

    Returns:
        A Doc whose gold choices are all distinct non-empty answer spans, or
        the single UNANSWERABLE sentinel when no answer is supported.
    """
    # De-duplicate while preserving dataset order. The previous list(set(...))
    # made the choice order vary across runs because of Python's string-hash
    # randomization, hurting reproducibility of the eval.
    answers = list(dict.fromkeys(ans for ans in line["answers"]["text"] if len(ans) > 0))

    if answers:
        choices = [f" {ans}" for ans in answers]
    else:
        # Unanswerable question: the expected generation is the sentinel.
        choices = [f" {UNANSWERABLE}"]

    return Doc(
        task_name=task_name,
        query=f"Context: {line['context']}\nQuestion: {line['question']}\n"
        f"Answer with a span from the context, or \"{UNANSWERABLE}\" if the question cannot be answered.\nAnswer:",
        choices=choices,
        # Every distinct gold answer counts as a correct target.
        gold_index=list(range(len(choices))),
    )
62+
3463

3564
# Full SQuAD 2.0 task: evaluates both answerable and unanswerable validation
# questions. The model must generate "unanswerable" (see squad_v2_prompt)
# when the context does not support an answer.
squad_v2 = LightevalTaskConfig(
    name="squad_v2",
    prompt_function=squad_v2_prompt,
    hf_repo="rajpurkar/squad_v2",
    hf_subset="squad_v2",
    evaluation_splits=("validation",),
    few_shots_split="train",
    # Stop at a newline or a new question header so generation ends right
    # after the answer span.
    stop_sequence=["\n", "Question:", "question:"],
    generation_size=200,
    metrics=[Metrics.exact_match, Metrics.f1_score],
    version=2,  # v1 covered answerable questions only (see squad_v2:answerable)
)
76+
77+
squad_v2_answerable = LightevalTaskConfig(
78+
name="squad_v2:answerable",
3779
prompt_function=get_qa_prompt_function(
3880
Language.ENGLISH,
3981
lambda line: {
@@ -49,10 +91,11 @@
4991
few_shots_split="train",
5092
stop_sequence=["\n", "Question:", "question:"],
5193
generation_size=200,
52-
metrics=[Metrics.exact_match],
94+
metrics=[Metrics.exact_match, Metrics.f1_score],
5395
version=1,
5496
)
5597

5698
# Tasks exported by this module: the full SQuAD 2.0 evaluation and the
# answerable-only variant.
TASKS_TABLE = [
    squad_v2,
    squad_v2_answerable,
]

0 commit comments

Comments
 (0)