feat: integrate worldvqa benchmark task (#1168)

Luodian · web-flow · commit c3e36752f08c · 2026-02-23T15:30:30.000+08:00
diff --git a/docs/current_tasks.md b/docs/current_tasks.md
@@ -333,6 +333,9 @@ python -m lmms_eval --tasks list_with_num
 - [WorldQA](https://zhangyuanhan-ai.github.io/WorldQA/) (worldqa)
   - WorldQA Generation (worldqa_gen)
   - WorldQA Multiple Choice (worldqa_mc)
+- [WorldVQA](https://huggingface.co/datasets/moonshotai/WorldVQA) (worldvqa)
+  - WorldQA Compatibility Generation (worldvqa_gen)
+  - WorldQA Compatibility Multiple Choice (worldvqa_mc)
 - [YouCook2](http://youcook2.eecs.umich.edu/) (youcook2_val)
 
 ### Long Video & Temporal Understanding
diff --git a/lmms_eval/tasks/worldvqa/_default_template_yaml b/lmms_eval/tasks/worldvqa/_default_template_yaml
@@ -0,0 +1,8 @@
+# Shared settings for the task configs in this directory that declare
+# `include: _default_template_yaml` (worldvqa_generation, worldvqa_mc and
+# worldvqa_mcppl). They point at the lmms-lab/worldqa dataset.
+dataset_path: lmms-lab/worldqa
+dataset_kwargs:
+  token: True
+  video: True
+  # Same directory name that utils.worldvqa_doc_to_visual joins under HF_HOME
+  # when it builds its fallback video path.
+  cache_dir: multi-hop-reasoning
+metadata:
+  version: 0.0
+  gpt_eval_model_name: "gpt-4-0613"
diff --git a/lmms_eval/tasks/worldvqa/utils.py b/lmms_eval/tasks/worldvqa/utils.py
@@ -0,0 +1,100 @@
+import base64
+import io
+import os
+
+from PIL import Image
+
+from lmms_eval.tasks.worldqa.utils import (
+    MultiChoiceRegexFilter,
+    worldq_gen_gpt_eval,
+    worldqa_aggregate_gen,
+    worldqa_aggregate_mc,
+    worldqa_aggregate_mc_eval,
+    worldqa_aggregate_mc_ppl,
+    worldqa_doc_to_answer,
+    worldqa_doc_to_answer_mc,
+    worldqa_doc_to_answer_mc_ppl,
+    worldqa_doc_to_choice,
+    worldqa_doc_to_text,
+    worldqa_doc_to_visual,
+    worldqa_process_results,
+    worldqa_process_results_mc,
+)
+
+
+def _open_rgb(source):
+    """Load an image with PIL and return a detached RGB copy.
+
+    ``Image.open`` is lazy and keeps the underlying file open. Opening inside
+    a ``with`` block and converting there forces the pixel data to load, so
+    the file handle is released immediately instead of lingering until the
+    image object is garbage collected.
+
+    Args:
+        source: Filesystem path or binary file-like object accepted by
+            ``Image.open``.
+
+    Returns:
+        A new ``PIL.Image.Image`` in RGB mode.
+    """
+    with Image.open(source) as opened:
+        return opened.convert("RGB")
+
+
+def worldvqa_doc_to_visual(doc):
+    """Resolve the visual input of one document as a list.
+
+    Sources are tried in this order:
+
+    1. ``doc["image"]``: a PIL image, a string (an existing file path,
+       otherwise treated as base64-encoded image bytes), or a dict carrying
+       ``path`` and/or ``bytes`` entries.
+    2. ``doc["video"]``: a non-empty path string, or a dict with a ``path``.
+    3. ``worldqa_doc_to_visual(doc)``; if that raises ``SystemExit``, a path
+       built from ``doc["video_idx"]`` under ``HF_HOME``.
+
+    Args:
+        doc: Dataset row; must provide ``.get``.
+
+    Returns:
+        A one-element list holding an RGB ``PIL.Image.Image`` or a video path
+        string, whatever ``worldqa_doc_to_visual`` returns, or ``[]`` when the
+        WorldQA helper exits and the row has no ``video_idx``.
+
+    Raises:
+        Errors from ``base64.b64decode`` or ``Image.open`` propagate when an
+        image string is neither an existing path nor decodable image data.
+    """
+    image = doc.get("image")
+    if image is not None:
+        if isinstance(image, Image.Image):
+            return [image.convert("RGB")]
+        if isinstance(image, str):
+            # A string is an on-disk path when it exists; otherwise it is
+            # decoded as base64 image bytes.
+            if os.path.exists(image):
+                return [_open_rgb(image)]
+            return [_open_rgb(io.BytesIO(base64.b64decode(image)))]
+        if isinstance(image, dict):
+            image_path = image.get("path")
+            if image_path and os.path.exists(image_path):
+                return [_open_rgb(image_path)]
+            image_bytes = image.get("bytes")
+            if image_bytes is not None:
+                return [_open_rgb(io.BytesIO(image_bytes))]
+        # Unsupported or unusable image value: fall through to the video fields.
+
+    video = doc.get("video")
+    if isinstance(video, str) and video:
+        return [video]
+    if isinstance(video, dict):
+        video_path = video.get("path")
+        if video_path:
+            return [video_path]
+
+    try:
+        return worldqa_doc_to_visual(doc)
+    except SystemExit:
+        # NOTE(review): the WorldQA helper apparently terminates the process
+        # when it cannot resolve a video -- confirm upstream. Intercept that
+        # so a single row cannot abort the whole evaluation run.
+        video_idx = doc.get("video_idx")
+        if not video_idx:
+            return []
+        hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/"))
+        return [os.path.join(hf_home, "multi-hop-reasoning", "videos", f"{video_idx}.mp4")]
+
+
+def worldvqa_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the text prompt for one document.
+
+    Rows that carry an ``option`` or ``video_idx`` key are delegated to
+    ``worldqa_doc_to_text``. Every other row gets its stripped ``question``
+    wrapped in the configured ``pre_prompt`` and ``post_prompt``.
+
+    Args:
+        doc: Dataset row; must contain ``question`` unless it is delegated.
+        lmms_eval_specific_kwargs: Optional mapping that may hold
+            ``pre_prompt`` and ``post_prompt``; missing entries default to
+            the empty string.
+
+    Returns:
+        The prompt string.
+    """
+    is_worldqa_row = any(key in doc for key in ("option", "video_idx"))
+    if is_worldqa_row:
+        return worldqa_doc_to_text(doc, lmms_eval_specific_kwargs=lmms_eval_specific_kwargs)
+
+    prompt_cfg = {} if lmms_eval_specific_kwargs is None else lmms_eval_specific_kwargs
+    prefix = prompt_cfg.get("pre_prompt", "")
+    suffix = prompt_cfg.get("post_prompt", "")
+    question = doc["question"].strip()
+    return f"{prefix}{question}{suffix}"
+
+
+# Answer extraction, result processing and aggregation are reused unchanged
+# from the WorldQA task. The aliases below expose them under worldvqa_* names,
+# which the task YAML files reference via `!function utils.<name>`.
+worldvqa_doc_to_answer = worldqa_doc_to_answer
+worldvqa_doc_to_answer_mc = worldqa_doc_to_answer_mc
+worldvqa_doc_to_answer_mc_ppl = worldqa_doc_to_answer_mc_ppl
+worldvqa_doc_to_choice = worldqa_doc_to_choice
+worldvqa_process_results = worldqa_process_results
+worldvqa_process_results_mc = worldqa_process_results_mc
+worldvqa_aggregate_gen = worldqa_aggregate_gen
+worldvqa_aggregate_mc = worldqa_aggregate_mc
+worldvqa_aggregate_mc_eval = worldqa_aggregate_mc_eval
+worldvqa_aggregate_mc_ppl = worldqa_aggregate_mc_ppl
+# The imported WorldQA function is spelled `worldq_gen_gpt_eval` (no "a"
+# after "worldq"), matching the import at the top of this module.
+worldvqa_gen_gpt_eval = worldq_gen_gpt_eval
+
+# Public names of this module. MultiChoiceRegexFilter is re-exported because
+# worldvqa_mc.yaml references it as utils.MultiChoiceRegexFilter.
+__all__ = [
+    "MultiChoiceRegexFilter",
+    "worldvqa_doc_to_visual",
+    "worldvqa_doc_to_text",
+    "worldvqa_doc_to_answer",
+    "worldvqa_doc_to_answer_mc",
+    "worldvqa_doc_to_answer_mc_ppl",
+    "worldvqa_doc_to_choice",
+    "worldvqa_process_results",
+    "worldvqa_process_results_mc",
+    "worldvqa_aggregate_gen",
+    "worldvqa_aggregate_mc",
+    "worldvqa_aggregate_mc_eval",
+    "worldvqa_aggregate_mc_ppl",
+    "worldvqa_gen_gpt_eval",
+]
diff --git a/lmms_eval/tasks/worldvqa/worldvqa.yaml b/lmms_eval/tasks/worldvqa/worldvqa.yaml
@@ -0,0 +1,27 @@
+# Standalone WorldVQA task: loads moonshotai/WorldVQA directly and, unlike the
+# sibling configs in this directory, does not include _default_template_yaml.
+dataset_path: moonshotai/WorldVQA
+dataset_kwargs:
+  token: False
+task: "worldvqa"
+# NOTE(review): evaluation reads the split named `train`; presumably the
+# dataset publishes no separate test split -- confirm.
+test_split: train
+output_type: generate_until
+doc_to_visual: !function utils.worldvqa_doc_to_visual
+doc_to_text: !function utils.worldvqa_doc_to_text
+# NOTE(review): presumably resolved as the `answer` field of each document --
+# confirm against the dataset schema.
+doc_to_target: "answer"
+# Deterministic decoding: sampling off, a single beam, temperature 0, and at
+# most 64 new tokens per answer.
+generation_kwargs:
+  max_new_tokens: 64
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# Exact match with case and punctuation ignored, aggregated as a mean.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+# utils.worldvqa_doc_to_text wraps the stripped question in these two strings.
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer briefly."
+metadata:
+  version: 0.0
diff --git a/lmms_eval/tasks/worldvqa/worldvqa_generation.yaml b/lmms_eval/tasks/worldvqa/worldvqa_generation.yaml
@@ -0,0 +1,20 @@
+# Open-ended generation task registered as `worldvqa_gen`. dataset_path,
+# dataset_kwargs and metadata come from the included _default_template_yaml.
+dataset_name: "Generation"
+task: "worldvqa_gen"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.worldvqa_doc_to_visual
+doc_to_text: !function utils.worldvqa_doc_to_text
+# The target, result-processing and aggregation functions below are aliases of
+# the WorldQA implementations (see the assignments in utils.py).
+doc_to_target: !function utils.worldvqa_doc_to_answer
+process_results: !function utils.worldvqa_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.worldvqa_aggregate_gen
+    higher_is_better: true
+  - metric: gpt_eval
+    aggregation: !function utils.worldvqa_gen_gpt_eval
+    higher_is_better: true
+# Both prompts are empty, so no extra text is configured around the question.
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/worldvqa/worldvqa_mc.yaml b/lmms_eval/tasks/worldvqa/worldvqa_mc.yaml
@@ -0,0 +1,26 @@
+# Multiple-choice task (answered by generation) registered as `worldvqa_mc`.
+# dataset_path, dataset_kwargs and metadata come from the included
+# _default_template_yaml.
+dataset_name: "MC"
+task: "worldvqa_mc"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.worldvqa_doc_to_visual
+doc_to_text: !function utils.worldvqa_doc_to_text
+doc_to_target: !function utils.worldvqa_doc_to_answer_mc
+process_results: !function utils.worldvqa_process_results_mc
+metric_list:
+  - metric: gpt_eval
+    aggregation: !function utils.worldvqa_aggregate_mc_eval
+    higher_is_better: true
+# The post_prompt asks the model to reply with the option letter only.
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+# Post-processing filter re-exported from the WorldQA utilities. The pattern
+# matches one capital letter in parentheses, e.g. "(A)".
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/worldvqa/worldvqa_mcppl.yaml b/lmms_eval/tasks/worldvqa/worldvqa_mcppl.yaml
@@ -0,0 +1,15 @@
+# Multiple-choice task using the `multiple_choice` output type with an `acc`
+# metric. The task name is `worldvqa_mc_ppl`, although this file is named
+# worldvqa_mcppl.yaml. dataset_path, dataset_kwargs and metadata come from the
+# included _default_template_yaml.
+dataset_name: "MC_PPL"
+task: "worldvqa_mc_ppl"
+test_split: test
+output_type: multiple_choice
+doc_to_visual: !function utils.worldvqa_doc_to_visual
+# A plain field name here, not the utils.worldvqa_doc_to_text function used by
+# the other worldvqa configs.
+doc_to_text: "question"
+doc_to_target: !function utils.worldvqa_doc_to_answer_mc_ppl
+doc_to_choice: !function utils.worldvqa_doc_to_choice
+metric_list:
+  - metric: acc
+# NOTE(review): with doc_to_text set to a field name, these prompts are
+# presumably not applied -- confirm.
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
+include: _default_template_yaml