Skip to content

Commit c3e3675

Browse files
authored
feat: integrate worldvqa benchmark task (#1168)
1 parent 71dd188 commit c3e3675

7 files changed

Lines changed: 199 additions & 0 deletions

File tree

docs/current_tasks.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,9 @@ python -m lmms_eval --tasks list_with_num
333333
- [WorldQA](https://zhangyuanhan-ai.github.io/WorldQA/) (worldqa)
334334
- WorldQA Generation (worldqa_gen)
335335
- WorldQA Multiple Choice (worldqa_mc)
336+
- [WorldVQA](https://huggingface.co/datasets/moonshotai/WorldVQA) (worldvqa)
337+
- WorldQA Compatibility Generation (worldvqa_gen)
338+
- WorldQA Compatibility Multiple Choice (worldvqa_mc)
336339
- [YouCook2](http://youcook2.eecs.umich.edu/) (youcook2_val)
337340

338341
### Long Video & Temporal Understanding
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
dataset_path: lmms-lab/worldqa
2+
dataset_kwargs:
3+
token: True
4+
video: True
5+
cache_dir: multi-hop-reasoning
6+
metadata:
7+
version: 0.0
8+
gpt_eval_model_name: "gpt-4-0613"

lmms_eval/tasks/worldvqa/utils.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
import base64
2+
import io
3+
import os
4+
5+
from PIL import Image
6+
7+
from lmms_eval.tasks.worldqa.utils import (
8+
MultiChoiceRegexFilter,
9+
worldq_gen_gpt_eval,
10+
worldqa_aggregate_gen,
11+
worldqa_aggregate_mc,
12+
worldqa_aggregate_mc_eval,
13+
worldqa_aggregate_mc_ppl,
14+
worldqa_doc_to_answer,
15+
worldqa_doc_to_answer_mc,
16+
worldqa_doc_to_answer_mc_ppl,
17+
worldqa_doc_to_choice,
18+
worldqa_doc_to_text,
19+
worldqa_doc_to_visual,
20+
worldqa_process_results,
21+
worldqa_process_results_mc,
22+
)
23+
24+
25+
def _worldvqa_open_rgb(source):
    """Open *source* (a filesystem path or a binary file object) as an RGB image.

    The image is opened inside a ``with`` block so the underlying file handle
    is released as soon as the pixel data has been converted; ``convert``
    returns a new, detached image that stays valid after the original closes.
    """
    with Image.open(source) as opened:
        return opened.convert("RGB")


def worldvqa_doc_to_visual(doc):
    """Return the visual inputs (images or video paths) for one dataset row.

    Sources are tried in this order and the first hit wins:

    1. ``doc["image"]`` when present and not ``None``:
       * a PIL image is converted to RGB;
       * a ``str`` is opened as a file if that path exists, otherwise it is
         decoded as a base64 payload;
       * a ``dict`` is read from its ``"path"`` entry (if that file exists)
         or, failing that, from its raw ``"bytes"`` entry.
    2. ``doc["video"]``: a non-empty ``str`` is returned as-is, a ``dict``
       contributes its ``"path"`` entry.
    3. WorldQA-style rows identified by ``video_idx``: delegated to
       ``worldqa_doc_to_visual``. If that helper terminates with
       ``SystemExit``, a path under ``$HF_HOME/multi-hop-reasoning/videos``
       is returned instead.

    Args:
        doc: Mapping holding one dataset row.

    Returns:
        A single-element list with a PIL RGB image or a video path, or an
        empty list when the row carries no usable visual reference.
    """
    image = doc.get("image")
    if image is not None:
        if isinstance(image, Image.Image):
            return [image.convert("RGB")]
        if isinstance(image, str):
            if os.path.exists(image):
                return [_worldvqa_open_rgb(image)]
            # Not a file on disk, so the string is treated as base64 data.
            return [_worldvqa_open_rgb(io.BytesIO(base64.b64decode(image)))]
        if isinstance(image, dict):
            image_path = image.get("path")
            if image_path and os.path.exists(image_path):
                return [_worldvqa_open_rgb(image_path)]
            image_bytes = image.get("bytes")
            if image_bytes is not None:
                return [_worldvqa_open_rgb(io.BytesIO(image_bytes))]

    video = doc.get("video")
    if isinstance(video, str) and video:
        return [video]
    if isinstance(video, dict):
        video_path = video.get("path")
        if video_path:
            return [video_path]

    # Without a video index there is nothing left to resolve. Checking here,
    # before delegating, makes the empty result reachable for every such row
    # rather than only when the delegate happens to exit.
    # NOTE(review): presumably worldqa_doc_to_visual needs "video_idx" to
    # build its path — confirm against lmms_eval.tasks.worldqa.utils.
    video_idx = doc.get("video_idx")
    if not video_idx:
        return []

    try:
        return worldqa_doc_to_visual(doc)
    except SystemExit:
        # The delegate aborted the process; fall back to a best-effort path
        # under the Hugging Face cache instead of terminating the whole run.
        hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/"))
        return [os.path.join(hf_home, "multi-hop-reasoning", "videos", f"{video_idx}.mp4")]
59+
60+
61+
def worldvqa_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the text prompt for one dataset row.

    Rows that contain an ``"option"`` or ``"video_idx"`` key are handed to
    ``worldqa_doc_to_text`` unchanged. Every other row is rendered as the
    stripped ``doc["question"]`` wrapped in the configured ``pre_prompt`` and
    ``post_prompt`` (both default to the empty string).

    Args:
        doc: Mapping holding one dataset row.
        lmms_eval_specific_kwargs: Optional mapping that may provide
            ``"pre_prompt"`` and ``"post_prompt"``; ``None`` means no
            prefix and no suffix.

    Returns:
        The prompt string.
    """
    is_worldqa_row = "option" in doc or "video_idx" in doc
    if is_worldqa_row:
        return worldqa_doc_to_text(doc, lmms_eval_specific_kwargs=lmms_eval_specific_kwargs)

    prompt_cfg = {} if lmms_eval_specific_kwargs is None else lmms_eval_specific_kwargs
    prefix = prompt_cfg.get("pre_prompt", "")
    suffix = prompt_cfg.get("post_prompt", "")
    question = doc["question"].strip()
    return f"{prefix}{question}{suffix}"
71+
72+
73+
# The WorldQA-compatible task variants reuse the WorldQA implementations
# under worldvqa_* names so the task YAML files can reference them through
# this module.

# Target / choice builders.
worldvqa_doc_to_answer = worldqa_doc_to_answer
worldvqa_doc_to_answer_mc = worldqa_doc_to_answer_mc
worldvqa_doc_to_answer_mc_ppl = worldqa_doc_to_answer_mc_ppl
worldvqa_doc_to_choice = worldqa_doc_to_choice

# Per-document result processing.
worldvqa_process_results = worldqa_process_results
worldvqa_process_results_mc = worldqa_process_results_mc

# Metric aggregation.
worldvqa_aggregate_gen = worldqa_aggregate_gen
worldvqa_aggregate_mc = worldqa_aggregate_mc
worldvqa_aggregate_mc_eval = worldqa_aggregate_mc_eval
worldvqa_aggregate_mc_ppl = worldqa_aggregate_mc_ppl
# The imported helper is spelled "worldq_" (no trailing "a"), matching the
# name imported at the top of this module.
worldvqa_gen_gpt_eval = worldq_gen_gpt_eval
84+
85+
# Public names of this module, as referenced from the worldvqa task configs.
__all__ = [
    # Re-exported filter class.
    "MultiChoiceRegexFilter",
    # Prompt and visual builders defined in this module.
    "worldvqa_doc_to_visual",
    "worldvqa_doc_to_text",
    # Aliases of the WorldQA target / choice builders.
    "worldvqa_doc_to_answer",
    "worldvqa_doc_to_answer_mc",
    "worldvqa_doc_to_answer_mc_ppl",
    "worldvqa_doc_to_choice",
    # Aliases of the WorldQA result processors.
    "worldvqa_process_results",
    "worldvqa_process_results_mc",
    # Aliases of the WorldQA aggregators.
    "worldvqa_aggregate_gen",
    "worldvqa_aggregate_mc",
    "worldvqa_aggregate_mc_eval",
    "worldvqa_aggregate_mc_ppl",
    "worldvqa_gen_gpt_eval",
]
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
dataset_path: moonshotai/WorldVQA
2+
dataset_kwargs:
3+
token: False
4+
task: "worldvqa"
5+
test_split: train
6+
output_type: generate_until
7+
doc_to_visual: !function utils.worldvqa_doc_to_visual
8+
doc_to_text: !function utils.worldvqa_doc_to_text
9+
doc_to_target: "answer"
10+
generation_kwargs:
11+
max_new_tokens: 64
12+
temperature: 0
13+
top_p: 1.0
14+
num_beams: 1
15+
do_sample: false
16+
metric_list:
17+
- metric: exact_match
18+
aggregation: mean
19+
higher_is_better: true
20+
ignore_case: true
21+
ignore_punctuation: true
22+
lmms_eval_specific_kwargs:
23+
default:
24+
pre_prompt: ""
25+
post_prompt: "\nAnswer briefly."
26+
metadata:
27+
version: 0.0
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
dataset_name: "Generation"
2+
task: "worldvqa_gen"
3+
test_split: test
4+
output_type: generate_until
5+
doc_to_visual: !function utils.worldvqa_doc_to_visual
6+
doc_to_text: !function utils.worldvqa_doc_to_text
7+
doc_to_target: !function utils.worldvqa_doc_to_answer
8+
process_results: !function utils.worldvqa_process_results
9+
metric_list:
10+
- metric: submission
11+
aggregation: !function utils.worldvqa_aggregate_gen
12+
higher_is_better: true
13+
- metric: gpt_eval
14+
aggregation: !function utils.worldvqa_gen_gpt_eval
15+
higher_is_better: true
16+
lmms_eval_specific_kwargs:
17+
default:
18+
pre_prompt: ""
19+
post_prompt: ""
20+
include: _default_template_yaml
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
dataset_name: "MC"
2+
task: "worldvqa_mc"
3+
test_split: test
4+
output_type: generate_until
5+
doc_to_visual: !function utils.worldvqa_doc_to_visual
6+
doc_to_text: !function utils.worldvqa_doc_to_text
7+
doc_to_target: !function utils.worldvqa_doc_to_answer_mc
8+
process_results: !function utils.worldvqa_process_results_mc
9+
metric_list:
10+
- metric: gpt_eval
11+
aggregation: !function utils.worldvqa_aggregate_mc_eval
12+
higher_is_better: true
13+
lmms_eval_specific_kwargs:
14+
default:
15+
pre_prompt: ""
16+
post_prompt: "\nAnswer with the option's letter from the given choices directly."
17+
filter_list:
18+
- name: "flexible-extract"
19+
filter:
20+
- function: !function utils.MultiChoiceRegexFilter
21+
group_select: 0
22+
ignore_case: true
23+
ignore_punctuation: true
24+
regex_pattern: "(\\([A-Z]\\))"
25+
26+
include: _default_template_yaml
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
dataset_name: "MC_PPL"
2+
task: "worldvqa_mc_ppl"
3+
test_split: test
4+
output_type: multiple_choice
5+
doc_to_visual: !function utils.worldvqa_doc_to_visual
6+
doc_to_text: "question"
7+
doc_to_target: !function utils.worldvqa_doc_to_answer_mc_ppl
8+
doc_to_choice: !function utils.worldvqa_doc_to_choice
9+
metric_list:
10+
- metric: acc
11+
lmms_eval_specific_kwargs:
12+
default:
13+
pre_prompt: ""
14+
post_prompt: ""
15+
include: _default_template_yaml

0 commit comments

Comments
 (0)