Skip to content

Commit e818fea

Browse files
authored
feat: backfill VisuLogic benchmark integration (LMM-288) (#1159)
* feat: integrate VisuLogic benchmark task (#1137) * docs: add VisuLogic to current task index
1 parent 8d79eb6 commit e818fea

4 files changed

Lines changed: 117 additions & 0 deletions

File tree

docs/current_tasks.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ python -m lmms_eval --tasks list_with_num
7777
- [SeedBench 2](https://github.com/AILab-CVC/SEED-Bench) (seedbench_2)
7878
- [SeedBench 2 Plus](https://huggingface.co/datasets/AILab-CVC/SEED-Bench-2-plus) (seedbench_2_plus)
7979
- [VibeEval](https://github.com/reka-ai/reka-vibe-eval) (vibe_eval)
80+
- [VisuLogic](https://visulogic-benchmark.github.io/VisuLogic/) (visulogic)
8081
- [VizWizVQA](https://vizwiz.org/tasks-and-datasets/vqa/) (vizwiz_vqa)
8182
- VizWizVQA Validation (vizwiz_vqa_val)
8283
- VizWizVQA Test (vizwiz_vqa_test)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Shared template for the VisuLogic task; the task file pulls it in via `include`.
output_type: generate_until

# Document adapters implemented in utils.py.
doc_to_visual: !function utils.visulogic_doc_to_visual
doc_to_text: !function utils.visulogic_doc_to_text
doc_to_target: !function utils.visulogic_doc_to_target

# Sampling is disabled and temperature is 0; generation is capped at 64 new tokens.
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  do_sample: false

# Per-sample scoring and aggregation, both implemented in utils.py.
process_results: !function utils.visulogic_process_results
metric_list:
  - metric: visulogic_acc
    aggregation: !function utils.visulogic_aggregate_acc
    higher_is_better: true

# Prompt affixes read by utils.visulogic_doc_to_text.
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option letter only."
  qwen_vl:
    pre_prompt: ""
    post_prompt: " Answer:"

metadata:
  - version: 0.0

lmms_eval/tasks/visulogic/utils.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import io
2+
import re
3+
import zipfile
4+
from pathlib import Path
5+
6+
from huggingface_hub import snapshot_download
7+
from PIL import Image
8+
9+
# Hugging Face dataset repository that hosts the benchmark files.
DATASET_REPO_ID = "VisuLogic/VisuLogic"
# Option letters accepted when falling back to the first character of a reply.
OPTION_LETTERS = {"A", "B", "C", "D"}

# NOTE: this download runs at import time, so importing the module requires
# either network access or an already-populated Hugging Face cache.
# `local_dir_use_symlinks` is intentionally not passed: it only ever applied
# together with `local_dir` (not used here) and recent huggingface_hub
# releases deprecate and ignore it.
_DATASET_DIR = Path(snapshot_download(repo_id=DATASET_REPO_ID, repo_type="dataset"))
_IMAGES_ARCHIVE_PATH = _DATASET_DIR / "images.zip"

# Answer-extraction patterns. `_extract_option_letter` tries them in this
# order and uses the first pattern that matches anywhere in the response.
_ANSWER_PATTERNS = [
    # <answer>B</answer> or <answer>(B)</answer>
    re.compile(r"<answer>\s*\(?([A-D])\)?\s*</answer>", re.IGNORECASE | re.DOTALL),
    # \boxed{B}
    re.compile(r"\\boxed\{\s*([A-D])\s*\}", re.IGNORECASE),
    # "answer is B", "answer: B", "answer - (B)"
    re.compile(r"answer\s*(?:is|:|-)\s*\(?([A-D])\)?\b", re.IGNORECASE),
    # "option B"
    re.compile(r"option\s*([A-D])\b", re.IGNORECASE),
    # "(B)"
    re.compile(r"\(([A-D])\)", re.IGNORECASE),
]

# Handle to images.zip; stays None until `_get_images_archive` opens it.
_IMAGES_ARCHIVE = None
22+
23+
24+
def _get_images_archive() -> zipfile.ZipFile:
    """Return the module-wide handle to the images archive.

    The archive at ``_IMAGES_ARCHIVE_PATH`` is opened read-only on the first
    call and the same ``ZipFile`` object is returned on every later call.
    """
    global _IMAGES_ARCHIVE
    if _IMAGES_ARCHIVE is not None:
        return _IMAGES_ARCHIVE

    _IMAGES_ARCHIVE = zipfile.ZipFile(_IMAGES_ARCHIVE_PATH, "r")
    return _IMAGES_ARCHIVE
29+
30+
31+
def _extract_option_letter(text: str) -> str:
    """Return the option letter (A-D, upper-cased) found in *text*, or "".

    Lookup order:

    1. Each regex in ``_ANSWER_PATTERNS`` in turn; for the first pattern that
       matches, the *last* match in the response is used.
    2. Responses of at most three characters: the first character, if it is
       one of ``OPTION_LETTERS``.
    3. Responses that start with an option letter immediately followed by
       ``.``, ``)``, ``:``, a line break, or the end of the text, e.g.
       ``"B. because ..."`` or ``"C\\nExplanation"``. A bare space is not
       accepted as a delimiter so that prose such as ``"A triangle ..."`` is
       not mistaken for an answer.

    Args:
        text: Raw model response; it is converted with ``str()`` and stripped.

    Returns:
        A single upper-case letter, or an empty string when nothing matched.
    """
    normalized = str(text).strip()
    if not normalized:
        return ""

    for pattern in _ANSWER_PATTERNS:
        matches = pattern.findall(normalized)
        if matches:
            return matches[-1].upper()

    if len(normalized) <= 3:
        first_char = normalized.upper()[0]
        if first_char in OPTION_LETTERS:
            return first_char

    # Longer replies that lead with the letter ("B. because ...") used to be
    # dropped and scored as wrong; accept them when a clear delimiter follows.
    leading = re.match(
        r"\(?([A-D])\)?[ \t]*(?:[.:)]|\r?\n|$)",
        normalized,
        re.IGNORECASE,
    )
    if leading:
        return leading.group(1).upper()

    return ""
47+
48+
49+
def visulogic_doc_to_visual(doc):
    """Load the image referenced by ``doc["image_path"]`` from the images archive.

    Args:
        doc: Dataset record; ``image_path`` names a member of the images zip.

    Returns:
        A one-element list holding the image converted to RGB, or an empty
        list when the record has no usable image path.

    Raises:
        FileNotFoundError: If the path is not a member of the archive.
    """
    raw_path = doc.get("image_path")
    # A missing/None path must not turn into the literal member name "None".
    image_path = "" if raw_path is None else str(raw_path).strip()

    # Remove leading "./" and "/" prefixes only. str.lstrip("./") strips a
    # *character set*, so it also ate the leading dots of names such as
    # ".hidden/x.png".
    image_path = re.sub(r"^(?:\./|/)+", "", image_path)
    if image_path in ("", "."):
        return []

    archive = _get_images_archive()
    try:
        with archive.open(image_path) as image_file:
            image_bytes = image_file.read()
    except KeyError as error:
        raise FileNotFoundError(f"Image not found in {DATASET_REPO_ID} archive: {image_path}") from error

    return [Image.open(io.BytesIO(image_bytes)).convert("RGB")]
62+
63+
64+
def visulogic_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the text prompt: ``pre_prompt + question + post_prompt``.

    Args:
        doc: Dataset record; its ``question`` field is stripped of
            surrounding whitespace.
        lmms_eval_specific_kwargs: Optional mapping that may provide
            ``pre_prompt`` and ``post_prompt``; missing or ``None`` entries
            count as empty strings.

    Returns:
        The assembled prompt string.
    """
    kwargs = lmms_eval_specific_kwargs or {}
    # `or ""` keeps a None value from being rendered as the literal text
    # "None" inside the prompt.
    pre_prompt = kwargs.get("pre_prompt") or ""
    post_prompt = kwargs.get("post_prompt") or ""

    raw_question = doc.get("question")
    question = "" if raw_question is None else str(raw_question).strip()
    return f"{pre_prompt}{question}{post_prompt}"
71+
72+
73+
def visulogic_doc_to_target(doc):
    """Return the gold answer as the upper-cased first character of ``doc["label"]``.

    A missing or ``None`` label yields an empty string; without this guard
    ``str(None)`` would produce the bogus target ``"N"``.
    """
    label = doc.get("label")
    if label is None:
        return ""
    return str(label).strip().upper()[:1]
75+
76+
77+
def visulogic_process_results(doc, results):
    """Score one sample for the ``visulogic_acc`` metric.

    Args:
        doc: Dataset record; the gold letter comes from
            ``visulogic_doc_to_target``.
        results: Model outputs for the sample; only the first entry is used.

    Returns:
        ``{"visulogic_acc": 1.0}`` when an option letter was extracted from
        the response and it equals the target, otherwise
        ``{"visulogic_acc": 0.0}``.
    """
    prediction = str(results[0]).strip() if results else ""
    predicted_letter = _extract_option_letter(prediction)
    target = visulogic_doc_to_target(doc)
    # Require a parsed letter: an unparseable response ("") must not be
    # counted as correct just because the record's label is empty as well.
    is_correct = bool(predicted_letter) and predicted_letter == target
    return {"visulogic_acc": 1.0 if is_correct else 0.0}
83+
84+
85+
def visulogic_aggregate_acc(items):
    """Average the per-sample scores; an empty collection yields 0.0."""
    if items:
        total = sum(map(float, items))
        return total / len(items)
    return 0.0
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# VisuLogic task definition; everything not set here comes from the included template.
dataset_path: VisuLogic/VisuLogic
dataset_kwargs:
  data_files: data.jsonl
task: visulogic
# The `train` split is the one used as the test split.
test_split: train
include: _default_template_yaml

0 commit comments

Comments
 (0)