|
16 | 16 | when possible, but also determine when no answer is supported by the paragraph |
17 | 17 | and abstain from answering. |
18 | 18 |
|
| 19 | +note: |
| 20 | +This is an LLM-friendly adaptation of the original SQuAD 2.0 evaluation. |
| 21 | +The original evaluation uses extractive span selection with a confidence-based |
| 22 | +"no answer" threshold, which does not apply to generative models. |
| 23 | +Here, the model is instead instructed to generate "unanswerable" when the |
| 24 | +question cannot be answered from the context. EM and F1 metrics are computed |
| 25 | +over both answerable and unanswerable questions. |
| 26 | +
|
19 | 27 | languages: |
20 | 28 | english |
21 | 29 |
|
|
28 | 36 |
|
29 | 37 | from lighteval.metrics.metrics import Metrics |
30 | 38 | from lighteval.tasks.lighteval_task import LightevalTaskConfig |
| 39 | +from lighteval.tasks.requests import Doc |
31 | 40 | from lighteval.tasks.templates.qa import get_qa_prompt_function |
32 | 41 | from lighteval.utils.language import Language |
33 | 42 |
|
# Marker string the model is instructed to generate verbatim for SQuAD 2.0
# questions that have no supported answer in the context.
UNANSWERABLE = "unanswerable"
| 44 | + |
| 45 | + |
def squad_v2_prompt(line, task_name: str = None):
    """Build a SQuAD 2.0 ``Doc`` for generative evaluation.

    Deduplicates the gold answer spans for the question. When there are no
    gold spans (a SQuAD 2.0 "unanswerable" question), the single gold choice
    is the literal ``UNANSWERABLE`` marker the prompt instructs the model to
    emit.

    Args:
        line: A SQuAD 2.0 example dict with ``context``, ``question`` and
            ``answers["text"]`` fields.
        task_name: Name recorded on the produced ``Doc``.

    Returns:
        A ``Doc`` whose every choice is an acceptable gold target.
    """
    # dict.fromkeys dedupes while preserving dataset order, so the choices
    # (and hence few-shot targets) are deterministic across runs — set()
    # ordering varies with hash randomization.
    answers = list(dict.fromkeys(ans for ans in line["answers"]["text"] if len(ans) > 0))

    if not answers:
        # Unanswerable question: the only accepted generation is the marker.
        choices = [f" {UNANSWERABLE}"]
    else:
        choices = [f" {ans}" for ans in answers]

    return Doc(
        task_name=task_name,
        query=f"Context: {line['context']}\nQuestion: {line['question']}\n"
        f"Answer with a span from the context, or \"{UNANSWERABLE}\" if the question cannot be answered.\nAnswer:",
        choices=choices,
        # Every deduplicated gold span is an acceptable target.
        gold_index=list(range(len(choices))),
    )
| 62 | + |
34 | 63 |
|
# Full SQuAD 2.0 task: validation split contains both answerable and
# unanswerable questions; the prompt asks the model to emit "unanswerable"
# when no span supports an answer.
squad_v2 = LightevalTaskConfig(
    name="squad_v2",
    prompt_function=squad_v2_prompt,
    hf_repo="rajpurkar/squad_v2",
    hf_subset="squad_v2",
    evaluation_splits=("validation",),
    few_shots_split="train",
    # Cut generation as soon as the model starts a new line or a new question.
    stop_sequence=["\n", "Question:", "question:"],
    generation_size=200,
    # Exact match and token-level F1 over all questions (answerable or not).
    metrics=[Metrics.exact_match, Metrics.f1_score],
    version=2,
)
| 76 | + |
| 77 | +squad_v2_answerable = LightevalTaskConfig( |
| 78 | + name="squad_v2:answerable", |
37 | 79 | prompt_function=get_qa_prompt_function( |
38 | 80 | Language.ENGLISH, |
39 | 81 | lambda line: { |
|
49 | 91 | few_shots_split="train", |
50 | 92 | stop_sequence=["\n", "Question:", "question:"], |
51 | 93 | generation_size=200, |
52 | | - metrics=[Metrics.exact_match], |
| 94 | + metrics=[Metrics.exact_match, Metrics.f1_score], |
53 | 95 | version=1, |
54 | 96 | ) |
55 | 97 |
|
# Tasks this module exposes to lighteval's task registry.
TASKS_TABLE = [
    squad_v2,
    squad_v2_answerable,
]
0 commit comments