 from unitxt.benchmark import Benchmark
 from unitxt.catalog import add_to_catalog
 from unitxt.standard import DatasetRecipe
-from unitxt.templates import MultipleChoiceTemplate, MultiReferenceTemplate

-ai2d_llama_vision_template = MultipleChoiceTemplate(
-    input_format="{context} Look at the scientific diagram carefully and answer the following question: {question}\n{choices}\nRespond only with the correct option digit.",
-    choices_separator="\n",
-    target_field="answer",
-    enumerator="capitals",
-)
-doc_vqa_llama_vision_template = MultiReferenceTemplate(
-    input_format="{context} Read the text in the image carefully and answer the question with the text as seen exactly in the image."
-                 " For yes/no questions, just respond Yes or No. If the answer is numeric, just respond with the number and nothing else. "
-                 "If the answer has multiple words, just respond with the words and absolutely nothing else. Never respond in a sentence or a phrase.\n Question: {question}",
-    references_field="answers",
-)
-chart_qa_llama_vision_template = MultiReferenceTemplate(
-    input_format="{context} {question}\nAnswer the question with a single word.",
-    references_field="answers",
-    __description__="lmms-evals default template for chartqa.",
-)
 benchmark = Benchmark(
     subsets={
         "doc_vqa_default": DatasetRecipe(
...
         ),
         "doc_vqa_llama_vision_template": DatasetRecipe(
             card="cards.doc_vqa.lmms_eval",
-            template=doc_vqa_llama_vision_template,
+            template="templates.qa.llama_vision.with_context.doc_vqa",
             format="formats.chat_api",
         ),
         "info_vqa_llama_vision_template": DatasetRecipe(
             card="cards.info_vqa_lmms_eval",
-            template=doc_vqa_llama_vision_template,
+            template="templates.qa.llama_vision.with_context.info_vqa",
             format="formats.chat_api",
         ),
         "chart_qa_llama_vision_template": DatasetRecipe(
             card="cards.chart_qa_lmms_eval",
-            template=chart_qa_llama_vision_template,
+            template="templates.qa.llama_vision.with_context.chart_qa",
             format="formats.chat_api",
         ),
         "ai2d_llama_vision_template": DatasetRecipe(
             card="cards.ai2d",
-            template=ai2d_llama_vision_template,
+            template="templates.qa.llama_vision.multiple_choice.with_context.ai2d",
             format="formats.chat_api",
         ),
     },
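Note: the diff replaces the inline template objects with catalog names such as "templates.qa.llama_vision.with_context.doc_vqa". Those entries are presumably registered in a separate prepare script (not shown in this diff). A minimal sketch of such a registration, reusing the doc_vqa template body removed above and unitxt's add_to_catalog; the standalone script itself and the overwrite argument are assumptions, not part of this change:

# Sketch of a prepare script (assumed, not part of this diff) that registers the
# doc_vqa Llama-vision template under the catalog name referenced by DatasetRecipe.
from unitxt.catalog import add_to_catalog
from unitxt.templates import MultiReferenceTemplate

# Template body copied verbatim from the inline definition removed in this diff.
doc_vqa_llama_vision_template = MultiReferenceTemplate(
    input_format=(
        "{context} Read the text in the image carefully and answer the question"
        " with the text as seen exactly in the image. For yes/no questions, just"
        " respond Yes or No. If the answer is numeric, just respond with the number"
        " and nothing else. If the answer has multiple words, just respond with the"
        " words and absolutely nothing else. Never respond in a sentence or a phrase."
        "\n Question: {question}"
    ),
    references_field="answers",
)

# Register the template under the name used in the benchmark above.
add_to_catalog(
    doc_vqa_llama_vision_template,
    "templates.qa.llama_vision.with_context.doc_vqa",
    overwrite=True,
)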