Add WISE Benchmark Task

Purshow · Purshow · commit e06c725f300f · 2026-04-17T20:19:15.000+08:00
diff --git a/lmms_eval/tasks/WISE/README.md b/lmms_eval/tasks/WISE/README.md
@@ -0,0 +1,63 @@
+# WISE
+
+WISE is a knowledge-intensive text-to-image benchmark that evaluates whether models can use commonsense, cultural, scientific, spatial, and temporal knowledge to generate correct images.
+
+- Paper: https://arxiv.org/abs/2503.07265
+- Dataset: https://huggingface.co/datasets/Yuwei-Niu/WISE
+
+## Overview
+
+**Dataset:** 1000 prompts across 6 categories (Culture, Time, Space, Biology, Physics, Chemistry)
+
+**Evaluation:** A GPT-4o judge (configurable via `WISE_MODEL_NAME`) scores each generated image on three dimensions:
+- **Consistency** (0-2): How well the image matches the prompt
+- **Realism** (0-2): Visual quality and photorealism
+- **Aesthetic Quality** (0-2): Artistic appeal and composition
+
+**WiScore Formula:** `(0.7 × consistency + 0.2 × realism + 0.1 × aesthetic) / 2`
+
+**Final Score:** Weighted average of the six category scores, with each weight equal to the category's share of the 1000 prompts (Culture: 0.4, Time: 0.167, Space: 0.133, Biology/Physics/Chemistry: 0.1 each)
+
+## Environment Variables
+
+```bash
+export WISE_API_KEY="your-api-key"                    # Judge API key
+export WISE_BASE_URL="https://api.openai.com/v1"     # Judge API endpoint
+export WISE_MODEL_NAME="gpt-4o-2024-05-13"                       # Judge model name
+export WISE_RAW_OUTPUT_DIR="/path/to/model/output"   # Where model saves generated images
+```
+
+## Usage
+
+### Full Evaluation with BAGEL
+
+```bash
+cd /pfs/weiyang/Show/lmms-eval
+
+export WISE_API_KEY="sk-..."
+export WISE_BASE_URL="https://api.bltcy.ai/v1"
+export WISE_MODEL_NAME="gpt-4o"
+export WISE_RAW_OUTPUT_DIR="/pfs/weiyang/Show/lmms-eval/outputs/WISE_raw/bagel_umm"  # must match output_dir in --model_args below
+
+python -m lmms_eval \
+  --model bagel_umm \
+  --model_args pretrained=/pfs/weiyang/WISE_re/CKPT/ByteDance-Seed/BAGEL-7B-MoT,mode=generate,output_dir=/pfs/weiyang/Show/lmms-eval/outputs/WISE_raw/bagel_umm \
+  --tasks WISE \
+  --batch_size 1 \
+  --log_samples \
+  --output_path /pfs/weiyang/Show/lmms-eval/outputs/WISE_eval/bagel_umm
+```
+
+## Metrics
+
+- `WISE_culture_score`: Culture category score (prompt_id 1-400)
+- `WISE_time_score`: Time category score (prompt_id 401-567)
+- `WISE_space_score`: Space category score (prompt_id 568-700)
+- `WISE_biology_score`: Biology category score (prompt_id 701-800)
+- `WISE_physics_score`: Physics category score (prompt_id 801-900)
+- `WISE_chemistry_score`: Chemistry category score (prompt_id 901-1000)
+- `WISE_overall_wiscore`: Weighted overall score (main metric)
+
+All scores are in the range [0.0, 1.0].
+
+This task requires a model that can generate images from text; do not use VQA-only models or image-editing models.
diff --git a/lmms_eval/tasks/WISE/WISE.yaml b/lmms_eval/tasks/WISE/WISE.yaml
@@ -0,0 +1,60 @@
+# WISE text-to-image benchmark.
+#
+# Dataset notes:
+# - The source JSON is expected to be uploaded to Hugging Face as:
+#     Yuwei-Niu/WISE/final_data.json
+# - dataset_path is the Hub repo id and data_files is resolved inside that
+#   repo, so the test split loads without using any local mount path.
+
+dataset_path: Yuwei-Niu/WISE
+dataset_kwargs:
+  data_files:
+    test: final_data.json
+task: WISE
+test_split: test
+output_type: generate_until
+
+doc_to_visual: !function utils.wise_doc_to_visual
+doc_to_text: !function utils.wise_doc_to_text
+doc_to_target: !function utils.wise_doc_to_target
+
+generation_kwargs:  # deterministic decoding: sampling off, single beam
+  max_new_tokens: 64
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+
+process_results: !function utils.wise_process_results
+
+metric_list:  # six per-category scores plus the overall WiScore
+  - metric: WISE_culture_score
+    aggregation: !function utils.wise_aggregate_culture
+    higher_is_better: true
+  - metric: WISE_time_score
+    aggregation: !function utils.wise_aggregate_time
+    higher_is_better: true
+  - metric: WISE_space_score
+    aggregation: !function utils.wise_aggregate_space
+    higher_is_better: true
+  - metric: WISE_biology_score
+    aggregation: !function utils.wise_aggregate_biology
+    higher_is_better: true
+  - metric: WISE_physics_score
+    aggregation: !function utils.wise_aggregate_physics
+    higher_is_better: true
+  - metric: WISE_chemistry_score
+    aggregation: !function utils.wise_aggregate_chemistry
+    higher_is_better: true
+  - metric: WISE_overall_wiscore
+    aggregation: !function utils.wise_aggregate_overall_wiscore
+    higher_is_better: true
+
+lmms_eval_specific_kwargs:
+  default:  # both prompt affixes are empty by default
+    pre_prompt: ""
+    post_prompt: ""
+
+metadata:
+  - version: 0.1
+    description: "WISE text-to-image benchmark with OpenAI-compatible GPT image judge"
diff --git a/lmms_eval/tasks/WISE/utils.py b/lmms_eval/tasks/WISE/utils.py