Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/current_tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,9 @@ python -m lmms_eval --tasks list_with_num
- [WorldQA](https://zhangyuanhan-ai.github.io/WorldQA/) (worldqa)
- WorldQA Generation (worldqa_gen)
- WorldQA Multiple Choice (worldqa_mc)
- [WorldVQA](https://huggingface.co/datasets/moonshotai/WorldVQA) (worldvqa)
- WorldQA Compatibility Generation (worldvqa_gen)
- WorldQA Compatibility Multiple Choice (worldvqa_mc)
- [YouCook2](http://youcook2.eecs.umich.edu/) (youcook2_val)

### Long Video & Temporal Understanding
Expand Down
8 changes: 8 additions & 0 deletions lmms_eval/tasks/worldvqa/_default_template_yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Shared settings for the task files in this directory that end with
# `include: _default_template_yaml` (worldvqa_gen, worldvqa_mc, worldvqa_mc_ppl).
# Those tasks read the lmms-lab/worldqa dataset rather than moonshotai/WorldVQA.
dataset_path: lmms-lab/worldqa
dataset_kwargs:
token: True
video: True
# Same directory name that utils.worldvqa_doc_to_visual joins under HF_HOME
# when it rebuilds a video path from `video_idx`.
cache_dir: multi-hop-reasoning
metadata:
version: 0.0
# NOTE(review): presumably the judge model used by the gpt_eval metrics;
# confirm against lmms_eval.tasks.worldqa.utils.
gpt_eval_model_name: "gpt-4-0613"
100 changes: 100 additions & 0 deletions lmms_eval/tasks/worldvqa/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import base64
import io
import os

from PIL import Image

from lmms_eval.tasks.worldqa.utils import (
MultiChoiceRegexFilter,
worldq_gen_gpt_eval,
worldqa_aggregate_gen,
worldqa_aggregate_mc,
worldqa_aggregate_mc_eval,
worldqa_aggregate_mc_ppl,
worldqa_doc_to_answer,
worldqa_doc_to_answer_mc,
worldqa_doc_to_answer_mc_ppl,
worldqa_doc_to_choice,
worldqa_doc_to_text,
worldqa_doc_to_visual,
worldqa_process_results,
worldqa_process_results_mc,
)


def _open_rgb(source):
    """Open *source* (a path or binary file object) and return an RGB copy.

    The image is opened in a ``with`` block so the underlying file is
    released promptly; ``convert`` returns a separate image object, so the
    result remains usable after the block exits.
    """
    with Image.open(source) as handle:
        return handle.convert("RGB")


def worldvqa_doc_to_visual(doc):
    """Return the visual input for *doc* as a list.

    Sources are tried in this order:

    1. ``doc["image"]`` -- a non-empty string (existing file path, otherwise
       treated as base64 data), a dict with ``"path"`` and/or ``"bytes"``,
       or a ``PIL.Image.Image``. The result is ``[<RGB image>]``.
    2. ``doc["video"]`` -- a non-empty string, or a dict with a truthy
       ``"path"``. The result is ``[<path>]``.
    3. ``worldqa_doc_to_visual(doc)``. If that call raises ``SystemExit``,
       the path ``<HF_HOME>/multi-hop-reasoning/videos/<video_idx>.mp4`` is
       returned instead, or ``[]`` when ``video_idx`` is missing or falsy.

    An ``image`` value that is ``None``, an empty string, or a dict without
    a usable path or bytes is treated as "no image" and the lookup moves on
    to the video handling.
    """
    image = doc.get("image")
    if isinstance(image, str):
        # An empty string carries no image data; decoding it would only hand
        # zero bytes to PIL and fail, so skip it like the empty-video case.
        if image:
            if os.path.exists(image):
                return [_open_rgb(image)]
            # Not a file on disk: interpret the string as base64 image data.
            return [_open_rgb(io.BytesIO(base64.b64decode(image)))]
    elif isinstance(image, dict):
        image_path = image.get("path")
        if image_path and os.path.exists(image_path):
            return [_open_rgb(image_path)]
        image_bytes = image.get("bytes")
        if image_bytes is not None:
            return [_open_rgb(io.BytesIO(image_bytes))]
    elif image is not None and isinstance(image, Image.Image):
        return [image.convert("RGB")]

    video = doc.get("video")
    if isinstance(video, str) and video:
        return [video]
    if isinstance(video, dict):
        video_path = video.get("path")
        if video_path:
            return [video_path]

    try:
        return worldqa_doc_to_visual(doc)
    except SystemExit:
        # The delegated helper ended with SystemExit; rather than letting the
        # whole run terminate, rebuild the expected cache location ourselves.
        video_idx = doc.get("video_idx")
        if not video_idx:
            return []
        hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/"))
        return [os.path.join(hf_home, "multi-hop-reasoning", "videos", f"{video_idx}.mp4")]


def worldvqa_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the prompt text for *doc*.

    Documents that carry an ``"option"`` or ``"video_idx"`` key are passed
    unchanged to ``worldqa_doc_to_text``. Any other document gets its
    stripped ``"question"`` wrapped in the optional ``"pre_prompt"`` and
    ``"post_prompt"`` entries of *lmms_eval_specific_kwargs* (each defaults
    to an empty string; ``None`` kwargs behave like an empty dict).
    """
    uses_worldqa_schema = "option" in doc or "video_idx" in doc
    if uses_worldqa_schema:
        return worldqa_doc_to_text(doc, lmms_eval_specific_kwargs=lmms_eval_specific_kwargs)

    prompt_kwargs = {} if lmms_eval_specific_kwargs is None else lmms_eval_specific_kwargs
    prefix = prompt_kwargs.get("pre_prompt", "")
    suffix = prompt_kwargs.get("post_prompt", "")
    question = doc["question"].strip()
    return f"{prefix}{question}{suffix}"


# Compatibility aliases: re-export the helpers imported from
# lmms_eval.tasks.worldqa.utils under worldvqa_* names. They are kept as
# explicit module-level bindings because the task YAML files in this
# directory look them up by name via `!function utils.<name>`.
worldvqa_doc_to_answer = worldqa_doc_to_answer
worldvqa_doc_to_answer_mc = worldqa_doc_to_answer_mc
worldvqa_doc_to_answer_mc_ppl = worldqa_doc_to_answer_mc_ppl
worldvqa_doc_to_choice = worldqa_doc_to_choice
worldvqa_process_results = worldqa_process_results
worldvqa_process_results_mc = worldqa_process_results_mc
worldvqa_aggregate_gen = worldqa_aggregate_gen
worldvqa_aggregate_mc = worldqa_aggregate_mc
worldvqa_aggregate_mc_eval = worldqa_aggregate_mc_eval
worldvqa_aggregate_mc_ppl = worldqa_aggregate_mc_ppl
# The imported name really is `worldq_gen_gpt_eval` (no "a" after "worldq"),
# matching the import at the top of this module; do not "correct" it here.
worldvqa_gen_gpt_eval = worldq_gen_gpt_eval

# Public names of this module: the regex filter class imported from the
# worldqa utils, the two functions defined here, and the worldvqa_* aliases.
__all__ = [
"MultiChoiceRegexFilter",
"worldvqa_doc_to_visual",
"worldvqa_doc_to_text",
"worldvqa_doc_to_answer",
"worldvqa_doc_to_answer_mc",
"worldvqa_doc_to_answer_mc_ppl",
"worldvqa_doc_to_choice",
"worldvqa_process_results",
"worldvqa_process_results_mc",
"worldvqa_aggregate_gen",
"worldvqa_aggregate_mc",
"worldvqa_aggregate_mc_eval",
"worldvqa_aggregate_mc_ppl",
"worldvqa_gen_gpt_eval",
]
27 changes: 27 additions & 0 deletions lmms_eval/tasks/worldvqa/worldvqa.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Task `worldvqa`: short-answer generation over moonshotai/WorldVQA.
# This file does not include _default_template_yaml; it carries its own
# dataset and metadata settings.
dataset_path: moonshotai/WorldVQA
dataset_kwargs:
token: False
task: "worldvqa"
# The split named `train` is the one evaluated.
test_split: train
output_type: generate_until
doc_to_visual: !function utils.worldvqa_doc_to_visual
doc_to_text: !function utils.worldvqa_doc_to_text
# The reference answer is read from the `answer` field of each document.
doc_to_target: "answer"
generation_kwargs:
max_new_tokens: 64
temperature: 0
top_p: 1.0
num_beams: 1
do_sample: false
# Scoring: exact match averaged over documents, ignoring case and punctuation.
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
# pre_prompt / post_prompt are read by utils.worldvqa_doc_to_text and wrapped
# around the stripped question.
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer briefly."
metadata:
version: 0.0
20 changes: 20 additions & 0 deletions lmms_eval/tasks/worldvqa/worldvqa_generation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Task `worldvqa_gen`: free-form generation on the "Generation" configuration
# of the dataset named in _default_template_yaml (included at the bottom).
dataset_name: "Generation"
task: "worldvqa_gen"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.worldvqa_doc_to_visual
doc_to_text: !function utils.worldvqa_doc_to_text
doc_to_target: !function utils.worldvqa_doc_to_answer
process_results: !function utils.worldvqa_process_results
# Both metrics are aggregated by functions that utils.py re-exports from
# lmms_eval.tasks.worldqa.utils under worldvqa_* names.
metric_list:
- metric: submission
aggregation: !function utils.worldvqa_aggregate_gen
higher_is_better: true
- metric: gpt_eval
aggregation: !function utils.worldvqa_gen_gpt_eval
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
include: _default_template_yaml
26 changes: 26 additions & 0 deletions lmms_eval/tasks/worldvqa/worldvqa_mc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Task `worldvqa_mc`: multiple-choice questions answered by generation, on the
# "MC" configuration of the dataset named in _default_template_yaml.
dataset_name: "MC"
task: "worldvqa_mc"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.worldvqa_doc_to_visual
doc_to_text: !function utils.worldvqa_doc_to_text
doc_to_target: !function utils.worldvqa_doc_to_answer_mc
process_results: !function utils.worldvqa_process_results_mc
metric_list:
- metric: gpt_eval
aggregation: !function utils.worldvqa_aggregate_mc_eval
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer with the option's letter from the given choices directly."
# Post-processing filter: MultiChoiceRegexFilter (imported into utils from the
# worldqa utils) configured with a pattern for a parenthesised capital letter.
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: 0
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"

include: _default_template_yaml
15 changes: 15 additions & 0 deletions lmms_eval/tasks/worldvqa/worldvqa_mcppl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Task `worldvqa_mc_ppl`: multiple-choice output type on the "MC_PPL"
# configuration of the dataset named in _default_template_yaml.
dataset_name: "MC_PPL"
task: "worldvqa_mc_ppl"
test_split: test
output_type: multiple_choice
doc_to_visual: !function utils.worldvqa_doc_to_visual
# Unlike the other worldvqa task files, doc_to_text is the plain string
# "question" here rather than utils.worldvqa_doc_to_text.
doc_to_text: "question"
doc_to_target: !function utils.worldvqa_doc_to_answer_mc_ppl
doc_to_choice: !function utils.worldvqa_doc_to_choice
metric_list:
- metric: acc
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
include: _default_template_yaml