Skip to content

Commit 2a9cd80

Browse files
Add video holmes and perceptioncomp
1 parent 52c5620 commit 2a9cd80

9 files changed

Lines changed: 741 additions & 0 deletions

File tree

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Shared configuration for the PerceptionComp tasks. Task files pull this in
# with `include: _default_template_yaml` and override individual keys.
dataset_path: hrinnnn/PerceptionComp
dataset_kwargs:
  token: True
  # utils.py reads this value at import time (after dropping every line that
  # contains `!function`) and stores it as `cache_name`.
  cache_dir: perceptioncomp
  video: True
test_split: test
output_type: generate_until
doc_to_visual: !function utils.perceptioncomp_doc_to_visual
doc_to_text: !function utils.perceptioncomp_doc_to_text
doc_to_messages: !function utils.perceptioncomp_doc_to_messages
# NOTE(review): utils.perceptioncomp_process_results scores against
# doc["answer_id"], not this field — confirm "answer" exists in the dataset.
doc_to_target: "answer"
# Short budget: the default prompt asks for a single option letter.
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  top_p: 1.0
  do_sample: false
process_results: !function utils.perceptioncomp_process_results
# process_results emits the same per-sample record under every metric name
# below; each aggregation function filters the records it cares about.
metric_list:
  - metric: perceptioncomp_accuracy
    aggregation: !function utils.perceptioncomp_aggregate_accuracy
    higher_is_better: true
  # Per-category accuracy.
  - metric: perceptioncomp_category_outdoor_tour
    aggregation: !function utils.perceptioncomp_aggregate_category_outdoor_tour
    higher_is_better: true
  - metric: perceptioncomp_category_shopping
    aggregation: !function utils.perceptioncomp_aggregate_category_shopping
    higher_is_better: true
  - metric: perceptioncomp_category_sport
    aggregation: !function utils.perceptioncomp_aggregate_category_sport
    higher_is_better: true
  - metric: perceptioncomp_category_variety_show
    aggregation: !function utils.perceptioncomp_aggregate_category_variety_show
    higher_is_better: true
  - metric: perceptioncomp_category_home_tour
    aggregation: !function utils.perceptioncomp_aggregate_category_home_tour
    higher_is_better: true
  - metric: perceptioncomp_category_game
    aggregation: !function utils.perceptioncomp_aggregate_category_game
    higher_is_better: true
  - metric: perceptioncomp_category_movie
    aggregation: !function utils.perceptioncomp_aggregate_category_movie
    higher_is_better: true
  # Per-difficulty accuracy.
  - metric: perceptioncomp_difficulty_1
    aggregation: !function utils.perceptioncomp_aggregate_difficulty_1
    higher_is_better: true
  - metric: perceptioncomp_difficulty_2
    aggregation: !function utils.perceptioncomp_aggregate_difficulty_2
    higher_is_better: true
  - metric: perceptioncomp_difficulty_3
    aggregation: !function utils.perceptioncomp_aggregate_difficulty_3
    higher_is_better: true
lmms_eval_specific_kwargs:
  # utils.perceptioncomp_doc_to_text only consumes pre_prompt/post_prompt when
  # `format` is "qwen3_vl"; otherwise it appends its own fixed instruction, so
  # the two `default` values below are not used by that function.
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
  qwen3_vl:
    format: "qwen3_vl"
    pre_prompt: "Question: "
    post_prompt: "Answer with the option letter only."
metadata:
  - version: 0.0
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Direct-answer PerceptionComp task: the model replies with an option letter.
task: perceptioncomp
include: _default_template_yaml
# The two keys below repeat the values already set in _default_template_yaml.
doc_to_text: !function utils.perceptioncomp_doc_to_text
lmms_eval_specific_kwargs:
  # Only the qwen3_vl entry's prompts are read by perceptioncomp_doc_to_text;
  # the default branch of that function uses a fixed built-in instruction.
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
  qwen3_vl:
    format: "qwen3_vl"
    pre_prompt: "Question: "
    post_prompt: "Answer with the option letter only."
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Reasoning variant of PerceptionComp: the prompt asks for step-by-step
# reasoning that ends with "Final Answer: <letter>".
task: perceptioncomp_reasoning
include: _default_template_yaml
# NOTE(review): doc_to_messages is inherited from the template and builds its
# prompt with utils.perceptioncomp_doc_to_text, not the reasoning variant set
# here — confirm whether chat models actually receive the reasoning prompt.
doc_to_text: !function utils.perceptioncomp_doc_to_text_reasoning
# Larger token budget than the template's 64 so the reasoning is not cut off.
generation_kwargs:
  max_new_tokens: 4096
  temperature: 0
  top_p: 1.0
  do_sample: false
Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,311 @@
1+
"""PerceptionComp: A perception-centric video benchmark.
2+
3+
Evaluates VLMs on 279 videos with 1,114 five-choice MCQ questions (A-E)
4+
across 7 categories and 3 difficulty levels.
5+
6+
Reference: https://arxiv.org/abs/2603.26653
7+
Dataset: https://huggingface.co/datasets/hrinnnn/PerceptionComp
8+
9+
NOTE: Some videos (e.g. Monaco_21m-25m.mp4) cause decord to segfault,
10+
killing the process with no traceback (SIGKILL, exit -9). To avoid this,
11+
force the torchvision backend before running:
12+
export FORCE_QWENVL_VIDEO_READER=torchvision
13+
This must be set via `export` so child processes (e.g. accelerate launch)
14+
inherit it.
15+
"""
16+
17+
import os
18+
import random
19+
import re
20+
import sys
21+
from functools import lru_cache
22+
from pathlib import Path
23+
24+
import numpy as np
25+
import yaml
26+
from loguru import logger as eval_logger
27+
28+
DATASET_REPO_ID = "hrinnnn/PerceptionComp"
29+
30+
hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
31+
base_cache_dir = os.path.expanduser(hf_home)
32+
33+
# Load the shared task template to learn the dataset cache directory name.
# Lines carrying a `!function` tag are dropped first because yaml.safe_load
# has no constructor for that custom tag.
with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
    raw_data = f.readlines()
safe_data = [line for line in raw_data if "!function" not in line]
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
40+
41+
42+
@lru_cache(maxsize=1)
def _get_video_dir():
    """Return the ``data`` folder inside the dataset's hub snapshot.

    The snapshot is fetched (or reused) with ``snapshot_download`` under
    ``base_cache_dir``; the result is memoised so this runs once per process.
    """
    # Imported lazily so that importing this module does not need the hub client.
    from huggingface_hub import snapshot_download

    snapshot_root = snapshot_download(
        DATASET_REPO_ID,
        repo_type="dataset",
        cache_dir=base_cache_dir,
    )
    return os.path.join(snapshot_root, "data")
49+
50+
# Video categories reported as separate metrics. Spaces become underscores
# when a category is turned into a metric name.
CATEGORIES = [
    "outdoor tour", "shopping", "sport", "variety show",
    "home tour", "game", "movie",
]

# Difficulty levels reported as separate metrics.
DIFFICULTY_LEVELS = [1, 2, 3]
61+
62+
63+
# ──────────────────────────────────────────────
64+
# doc_to_visual
65+
# ──────────────────────────────────────────────
66+
67+
68+
def perceptioncomp_doc_to_visual(doc):
    """Return a one-element list with the video path for *doc*.

    The file ``<video_id>.<ext>`` is searched in the dataset video directory
    for the extensions mp4, MP4, mkv and webm, in that order. If none exists
    a warning is logged and an empty list is returned.
    """
    root = _get_video_dir()
    clip_id = doc["video_id"]
    candidate_paths = (
        os.path.join(root, f"{clip_id}.{suffix}")
        for suffix in ("mp4", "MP4", "mkv", "webm")
    )
    found = next((path for path in candidate_paths if os.path.exists(path)), None)
    if found is not None:
        return [found]
    eval_logger.warning(f"[perceptioncomp] Video not found: {clip_id}. Continuing with text-only fallback.")
    return []
77+
78+
79+
# ──────────────────────────────────────────────
80+
# doc_to_text
81+
# ──────────────────────────────────────────────
82+
83+
84+
def _build_options(doc):
85+
"""Build list of (label, text) tuples, skipping empty trailing options."""
86+
labels = ["A", "B", "C", "D", "E", "F"]
87+
options = []
88+
for i, label in enumerate(labels):
89+
choice = doc.get(f"answer_choice_{i}", "")
90+
if choice is not None and str(choice).strip():
91+
options.append((label, str(choice).strip()))
92+
return options
93+
94+
95+
def _build_options_str(doc):
    """Render the options of *doc* as newline-separated ``"<label>. <text>"`` lines."""
    rendered = [f"{label}. {text}" for label, text in _build_options(doc)]
    return "\n".join(rendered)
97+
98+
99+
def perceptioncomp_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the direct-answer prompt for *doc*.

    When ``lmms_eval_specific_kwargs["format"]`` is ``"qwen3_vl"`` the prompt
    is delegated to ``_doc_to_text_qwen3vl``. Otherwise the prompt is the
    question, the lettered options and a fixed instruction; any pre/post
    prompt in the kwargs is not used on this path.
    """
    wants_qwen3vl = bool(lmms_eval_specific_kwargs) and lmms_eval_specific_kwargs.get("format") == "qwen3_vl"
    if wants_qwen3vl:
        return _doc_to_text_qwen3vl(doc, lmms_eval_specific_kwargs)

    question_text = doc["question"]
    choices_text = _build_options_str(doc)
    instruction = (
        "Select the best answer to the following multiple-choice "
        "question based on the video. Respond with only the letter "
        "(A, B, C, D, or E) of the correct option."
    )
    return "\n".join([f"Question: {question_text}", choices_text, instruction])
111+
112+
113+
def _doc_to_text_qwen3vl(doc, lmms_eval_specific_kwargs=None):
    """Build the prompt as ``pre_prompt + question``, the options, then ``post_prompt``.

    Both prompt pieces come from ``lmms_eval_specific_kwargs`` and default to
    the empty string when the kwargs are absent, empty, or lack the key.
    """
    settings = lmms_eval_specific_kwargs or {}
    prefix = settings.get("pre_prompt", "")
    suffix = settings.get("post_prompt", "")
    question_text = doc["question"]
    choices_text = _build_options_str(doc)
    return f"{prefix}{question_text}\n{choices_text}\n{suffix}"
119+
120+
121+
def perceptioncomp_doc_to_messages(doc, lmms_eval_specific_kwargs=None):
    """Return a single user chat message: video part(s) first, then the text prompt.

    The text comes from ``perceptioncomp_doc_to_text`` and the video paths from
    ``perceptioncomp_doc_to_visual``; with no video found the message holds
    only the text part.
    """
    text_part = {"type": "text", "text": perceptioncomp_doc_to_text(doc, lmms_eval_specific_kwargs)}
    video_parts = [{"type": "video", "url": path} for path in perceptioncomp_doc_to_visual(doc)]
    return [{"role": "user", "content": video_parts + [text_part]}]
129+
130+
131+
def perceptioncomp_doc_to_text_reasoning(doc, lmms_eval_specific_kwargs=None):
    """Build the reasoning prompt for *doc*.

    The prompt is the question, the lettered options and an instruction to
    reason and finish with ``Final Answer: <letter>``. The
    ``lmms_eval_specific_kwargs`` argument is accepted but not used.
    """
    question_text = doc["question"]
    choices_text = _build_options_str(doc)
    instruction = (
        "Please perform a detailed reasoning based on the provided video frames to answer the following "
        "multiple-choice question selecting the best option from A through E and providing your final response "
        "strictly in the format: 'Final Answer: <letter>'."
    )
    return "\n".join([f"Question: {question_text}", choices_text, instruction])
140+
141+
142+
# ──────────────────────────────────────────────
143+
# Answer extraction
144+
# ──────────────────────────────────────────────
145+
146+
147+
def parse_multi_choice_response(response, all_choices, index2ans):
    """Extract the predicted option label from a free-form model response.

    Adapted from the MMMU evaluation utilities:
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10

    Detection is attempted in order and stops at the first pattern that
    matches at least one label: ``(X)``, then ``X `` (label followed by a
    space), then ``X.``. If none matches and the response has more than five
    words, the option texts themselves are searched case-insensitively. When
    several candidates remain, the one mentioned last in the response wins.

    Args:
        response: Raw text generated by the model.
        all_choices: Option labels that are valid for this question.
        index2ans: Mapping from option label to option text.

    Returns:
        The chosen label. A random label is returned when nothing can be
        parsed, and the empty string when ``all_choices`` is empty.
    """
    # Without any label there is nothing to pick; random.choice([]) would raise.
    if not all_choices:
        return ""

    for char in [",", ".", "!", "?", ";", ":", "'"]:
        response = response.strip(char)
    response = " " + response + " "  # add space to avoid partial match

    index_ans = True
    matched_format = None
    candidates = []
    for label_format in ("({})", "{} ", "{}."):
        candidates = [choice for choice in all_choices if label_format.format(choice) in response]
        if candidates:
            matched_format = label_format
            break

    # Fall back to matching the option text when the response is long enough.
    if not candidates and len(response.split()) > 5:
        lowered = response.lower()
        for index, ans in index2ans.items():
            if ans.lower() in lowered:
                candidates.append(index)
                index_ans = False  # candidates were found by content, not label

    if not candidates:  # still no answer: choose one at random
        return random.choice(all_choices)
    if len(candidates) == 1:
        return candidates[0]

    # Several candidates: keep the one whose last mention is furthest right.
    if not index_ans:
        lowered = response.lower()
        start_indexes = [lowered.rfind(index2ans[can].lower()) for can in candidates]
    elif matched_format == "({})":
        start_indexes = [response.rfind(f"({can})") for can in candidates]
    else:
        start_indexes = [response.rfind(f" {can} ") for can in candidates]
        # The space-delimited form can be absent for every candidate (always
        # the case when they were detected as "X."). Locate them with the
        # pattern that detected them, otherwise the first candidate would win
        # regardless of where it appears.
        if all(position == -1 for position in start_indexes):
            start_indexes = [response.rfind(matched_format.format(can)) for can in candidates]

    return candidates[int(np.argmax(start_indexes))]
206+
207+
208+
# ──────────────────────────────────────────────
209+
# process_results
210+
# ──────────────────────────────────────────────
211+
212+
213+
def perceptioncomp_process_results(doc, results):
    """Score one prediction and fan the record out to every metric name.

    The first entry of *results* is parsed into an option label and compared
    (case-insensitively) with the label derived from ``doc["answer_id"]``
    (0 -> A, 1 -> B, ...). The same record dict, holding ``score``,
    ``category`` and ``difficulty``, is returned under the overall, every
    per-category and every per-difficulty metric key.
    """
    raw_prediction = results[0]

    labelled = _build_options(doc)
    letters = [letter for letter, _ in labelled]
    letter_to_text = dict(labelled)

    predicted = parse_multi_choice_response(raw_prediction, letters, letter_to_text)
    expected = chr(65 + int(doc["answer_id"]))

    record = {
        "score": int(predicted.upper() == expected.upper()),
        "category": doc.get("category", ""),
        "difficulty": doc.get("difficulty", ""),
    }

    metric_names = ["perceptioncomp_accuracy"]
    metric_names += ["perceptioncomp_category_" + name.replace(" ", "_") for name in CATEGORIES]
    metric_names += [f"perceptioncomp_difficulty_{level}" for level in DIFFICULTY_LEVELS]
    return {metric: record for metric in metric_names}
237+
238+
239+
# ──────────────────────────────────────────────
240+
# Aggregation helpers
241+
# ──────────────────────────────────────────────
242+
243+
244+
def perceptioncomp_aggregate_accuracy(results):
    """Return overall accuracy in percent over all records, or 0.0 when empty.

    Each record contributes its ``"score"``; the result is also logged.
    """
    if len(results) == 0:
        return 0.0
    sample_count = len(results)
    hits = sum(record["score"] for record in results)
    accuracy = hits / sample_count * 100
    eval_logger.info(f"PerceptionComp Overall Accuracy: {accuracy:.2f}% [{sample_count} samples]")
    return accuracy
252+
253+
254+
def _aggregate_by_category(results, category):
255+
subset = [r for r in results if r["category"] == category]
256+
if not subset:
257+
return 0.0
258+
acc = sum(r["score"] for r in subset) / len(subset) * 100
259+
eval_logger.info(f"PerceptionComp [{category}]: {acc:.2f}% [{len(subset)} samples]")
260+
return acc
261+
262+
263+
def _aggregate_by_difficulty(results, difficulty):
264+
subset = [r for r in results if str(r["difficulty"]) == str(difficulty)]
265+
if not subset:
266+
return 0.0
267+
acc = sum(r["score"] for r in subset) / len(subset) * 100
268+
eval_logger.info(f"PerceptionComp Difficulty {difficulty}: {acc:.2f}% [{len(subset)} samples]")
269+
return acc
270+
271+
272+
# Per-category aggregation functions
273+
def perceptioncomp_aggregate_category_outdoor_tour(results):
    """Accuracy (percent) over records in the "outdoor tour" category."""
    return _aggregate_by_category(results, category="outdoor tour")
275+
276+
277+
def perceptioncomp_aggregate_category_shopping(results):
    """Accuracy (percent) over records in the "shopping" category."""
    return _aggregate_by_category(results, category="shopping")
279+
280+
281+
def perceptioncomp_aggregate_category_sport(results):
    """Accuracy (percent) over records in the "sport" category."""
    return _aggregate_by_category(results, category="sport")
283+
284+
285+
def perceptioncomp_aggregate_category_variety_show(results):
    """Accuracy (percent) over records in the "variety show" category."""
    return _aggregate_by_category(results, category="variety show")
287+
288+
289+
def perceptioncomp_aggregate_category_home_tour(results):
    """Accuracy (percent) over records in the "home tour" category."""
    return _aggregate_by_category(results, category="home tour")
291+
292+
293+
def perceptioncomp_aggregate_category_game(results):
    """Accuracy (percent) over records in the "game" category."""
    return _aggregate_by_category(results, category="game")
295+
296+
297+
def perceptioncomp_aggregate_category_movie(results):
    """Accuracy (percent) over records in the "movie" category."""
    return _aggregate_by_category(results, category="movie")
299+
300+
301+
# Per-difficulty aggregation functions
302+
def perceptioncomp_aggregate_difficulty_1(results):
    """Accuracy (percent) over records at difficulty level 1."""
    return _aggregate_by_difficulty(results, difficulty=1)
304+
305+
306+
def perceptioncomp_aggregate_difficulty_2(results):
    """Accuracy (percent) over records at difficulty level 2."""
    return _aggregate_by_difficulty(results, difficulty=2)
308+
309+
310+
def perceptioncomp_aggregate_difficulty_3(results):
    """Accuracy (percent) over records at difficulty level 3."""
    return _aggregate_by_difficulty(results, difficulty=3)

0 commit comments

Comments
 (0)