Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/current_tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,17 @@ python -m lmms_eval --tasks list_with_num
- mvbench_fine_grained_action
- mvbench_moving_attribute
- mvbench_egocentric_navigation
- [TVBench](https://huggingface.co/datasets/FunAILab/TVBench) (tvbench)
- tvbench_action_antonym
- tvbench_action_count
- tvbench_action_localization
- tvbench_action_sequence
- tvbench_egocentric_sequence
- tvbench_moving_direction
- tvbench_object_count
- tvbench_object_shuffle
- tvbench_scene_transition
- tvbench_unexpected_action
- [MotionBench](https://motion-bench.github.io/) (motionbench)
- motionbench_full
- [NExT-QA](https://github.com/doc-doc/NExT-QA) (nextqa)
Expand Down
24 changes: 24 additions & 0 deletions lmms_eval/tasks/tvbench/_default_template_yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Shared settings for the TVBench tasks; each tvbench_*.yaml file in this
# directory pulls this template in through `include: _default_template_yaml`.
dataset_path: FunAILab/TVBench
dataset_kwargs:
  token: True
  # utils._resolve_cache_dir reads this value and joins it onto HF_HOME to
  # build the directory in which video files are looked up.
  cache_dir: tvbench
  video: True
generation_kwargs:
  max_new_tokens: 32
  temperature: 0
  do_sample: false
output_type: generate_until
# Prompt construction, target extraction and scoring live in utils.py.
doc_to_visual: !function utils.tvbench_doc_to_visual
doc_to_text: !function utils.tvbench_doc_to_text
doc_to_target: !function utils.tvbench_doc_to_target
process_results: !function utils.tvbench_process_results
metric_list:
  # `tvbench_acc` is the key of the dict returned by utils.tvbench_process_results
  # (1.0 for a correct option letter, 0.0 otherwise).
  - metric: tvbench_acc
    aggregation: mean
    higher_is_better: true
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "Answer with the option letter only."
metadata:
  - version: 0.0
12 changes: 12 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Group definition: the name `tvbench` refers to all ten TVBench subtasks below.
group: tvbench
task:
  - tvbench_action_antonym
  - tvbench_action_count
  - tvbench_action_localization
  - tvbench_action_sequence
  - tvbench_egocentric_sequence
  - tvbench_moving_direction
  - tvbench_object_count
  - tvbench_object_shuffle
  - tvbench_scene_transition
  - tvbench_unexpected_action
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_action_antonym.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `action_antonym` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_action_antonym
dataset_name: action_antonym
test_split: train
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_action_count.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `action_count` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_action_count
dataset_name: action_count
test_split: train
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_action_localization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `action_localization` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_action_localization
dataset_name: action_localization
test_split: train
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_action_sequence.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `action_sequence` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_action_sequence
dataset_name: action_sequence
test_split: train
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_egocentric_sequence.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `egocentric_sequence` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_egocentric_sequence
dataset_name: egocentric_sequence
test_split: train
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_moving_direction.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `moving_direction` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_moving_direction
dataset_name: moving_direction
test_split: train
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_object_count.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `object_count` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_object_count
dataset_name: object_count
test_split: train
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_object_shuffle.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `object_shuffle` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_object_shuffle
dataset_name: object_shuffle
test_split: train
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_scene_transition.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `scene_transition` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_scene_transition
dataset_name: scene_transition
test_split: train
4 changes: 4 additions & 0 deletions lmms_eval/tasks/tvbench/tvbench_unexpected_action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TVBench `unexpected_action` subset. `test_split: train` selects the dataset's
# `train` split — presumably the only split the dataset ships; confirm.
include: _default_template_yaml
task: tvbench_unexpected_action
dataset_name: unexpected_action
test_split: train
193 changes: 193 additions & 0 deletions lmms_eval/tasks/tvbench/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import os
import re
from pathlib import Path

import yaml

# Option labels in display order: the candidate at position i of a question is
# labelled with the letter at position i of this string.
_CHOICE_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# TVBench subset names. _candidate_video_paths tries each of them as a possible
# sub-directory of the cache directory when searching for a video file.
_DATASET_NAMES = [
    "action_antonym",
    "action_count",
    "action_localization",
    "action_sequence",
    "egocentric_sequence",
    "moving_direction",
    "object_count",
    "object_shuffle",
    "scene_transition",
    "unexpected_action",
]


def _safe_get(doc, keys, default=""):
for key in keys:
value = doc.get(key)
if value is not None:
return value
return default


def _normalize_text(text):
return " ".join(str(text or "").strip().lower().split())


def _extract_candidates(doc):
    """Return the answer options of ``doc`` as a list of strings.

    Lookup order:

    1. ``doc["candidates"]``, then ``doc["options"]`` — the first of the two
       that holds a list or tuple wins. A key that is present but ``None``
       (or otherwise not a sequence) no longer hides the other key.
    2. Otherwise the flat keys ``option0``, ``option1``, ... are collected,
       skipping values that are ``None`` or ``""``.

    Returns an empty list when the document carries no options at all.
    """
    for key in ("candidates", "options"):
        value = doc.get(key)
        if isinstance(value, (list, tuple)):
            return [str(candidate) for candidate in value]

    options = []
    # One flat key per available option letter.
    for index in range(len(_CHOICE_LETTERS)):
        value = doc.get(f"option{index}")
        if value is not None and value != "":
            options.append(str(value))
    return options


def _resolve_cache_dir():
    """Work out the local directory that is searched for TVBench videos.

    The ``dataset_kwargs.cache_dir`` entry of ``_default_template_yaml`` (the
    file next to this module) is joined onto ``$HF_HOME``, which defaults to
    ``~/.cache/huggingface``. Returns ``None`` when the template does not name
    a cache directory.
    """
    template = Path(__file__).parent / "_default_template_yaml"
    with open(template, "r", encoding="utf-8") as stream:
        # Lines carrying the custom `!function` tag are dropped before parsing,
        # since yaml.safe_load has no constructor for that tag.
        plain_lines = [row for row in stream if "!function" not in row]

    parsed = yaml.safe_load("".join(plain_lines)) or {}
    cache_name = parsed.get("dataset_kwargs", {}).get("cache_dir", "")
    if cache_name:
        hf_root = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
        return os.path.join(hf_root, str(cache_name))
    return None


def _candidate_video_paths(video_name):
    """List the locations under the cache directory where a video may live.

    With no cache directory configured, the name itself is the only candidate.
    Otherwise the result holds ``video_name`` placed directly in the cache
    directory, in its ``video``/``videos``/``data`` sub-directories, and in the
    per-subset layouts ``<subset>/``, ``video/<subset>/`` and
    ``videos/<subset>/``. Duplicates are removed and first-seen order is kept.
    """
    if not _CACHE_DIR:
        return [video_name]

    layouts = [
        video_name,
        os.path.join("video", video_name),
        os.path.join("videos", video_name),
        os.path.join("data", video_name),
    ]
    for subset in _DATASET_NAMES:
        layouts += [
            os.path.join(subset, video_name),
            os.path.join("video", subset, video_name),
            os.path.join("videos", subset, video_name),
        ]

    # dict.fromkeys drops repeated paths while preserving insertion order.
    unique_paths = dict.fromkeys(os.path.join(_CACHE_DIR, layout) for layout in layouts)
    return list(unique_paths)


def _extract_choice_letter(prediction, candidates):
    """Map a free-form model response to an option letter.

    Args:
        prediction: Raw model output; anything falsy is treated as empty.
        candidates: Option texts in display order (option ``i`` is labelled
            with ``_CHOICE_LETTERS[i]``).

    Returns:
        The upper-case option letter, or ``""`` when nothing can be extracted.

    Strategy:
        1. Return the first *stand-alone* letter that is a valid option label
           (``"B"``, ``"(B)"``, ``"b."``, ``"The answer is B"``). All
           stand-alone letters are scanned, so an out-of-range one such as the
           pronoun in ``"I think B"`` no longer ends the search. Letters that
           belong to a contraction (``I'd``, ``don't``) are ignored, and the
           first letter of an ordinary word (``"Because ..."``) is never taken
           as an answer.
        2. Otherwise match option texts inside the response. When several
           match, options whose text is contained in another matching option
           (``"3"`` inside ``"13"``) are discarded; the letter is returned only
           if exactly one option remains.

    Note:
        A response that starts with the article "A" is still read as option A;
        that ambiguity cannot be resolved from the text alone.
    """
    text = str(prediction or "").strip()
    if not text:
        return ""

    # At least "AB" so a letter answer is accepted even with < 2 candidates.
    all_choices = _CHOICE_LETTERS[: max(len(candidates), 2)]
    uppercase = text.upper()

    # A single capital that is not glued to a word character and not part of
    # an apostrophe contraction (word'X or X'word). Quoted answers such as
    # 'B' still match because the quote is not attached to a word.
    standalone_letter = r"(?<!\w)(?<!\w['’])([A-Z])(?!\w)(?!['’]\w)"
    for letter_match in re.finditer(standalone_letter, uppercase):
        if letter_match.group(1) in all_choices:
            return letter_match.group(1)

    normalized_pred = _normalize_text(text)
    matched = []
    for index, candidate in enumerate(candidates):
        normalized_candidate = _normalize_text(candidate)
        if normalized_candidate and normalized_candidate in normalized_pred:
            matched.append((index, normalized_candidate))

    # Keep only the most specific matches: an option contained in another
    # matched option was most likely hit as a side effect of the longer one.
    specific = [
        index
        for index, value in matched
        if not any(value != other and value in other for _, other in matched)
    ]
    if len(specific) == 1 and specific[0] < len(all_choices):
        return all_choices[specific[0]]

    return ""


def tvbench_doc_to_visual(doc, lmms_eval_specific_kwargs=None):
    """Return the video path(s) for ``doc`` as a list of strings.

    The video reference is read from ``video``, ``video_path`` or
    ``video_file``. A dict value is unwrapped through its ``path``, ``video``
    or ``filename`` entry. A list value is returned as-is (stringified),
    without any path resolution.

    For a single name the result is, in order of preference:

    * the name itself when it is an existing absolute path;
    * the first existing location from ``_candidate_video_paths``;
    * the first candidate location even though it does not exist, so the
      caller gets a deterministic path to report.

    Returns an empty list when the document has no video reference.
    ``lmms_eval_specific_kwargs`` is accepted for interface compatibility and
    not used.
    """
    video_value = _safe_get(doc, ["video", "video_path", "video_file"], "")
    if isinstance(video_value, dict):
        video_value = _safe_get(video_value, ["path", "video", "filename"], "")

    if isinstance(video_value, list):
        return [str(video) for video in video_value]

    video_name = str(video_value).strip()
    if not video_name:
        return []

    if os.path.isabs(video_name) and os.path.exists(video_name):
        return [video_name]

    # Build the candidate list once; it serves both the search and the fallback.
    candidates = _candidate_video_paths(video_name) or [video_name]
    existing = next((path for path in candidates if os.path.exists(path)), None)
    return [existing if existing is not None else candidates[0]]


def tvbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the multiple-choice prompt for ``doc``.

    The prompt consists of the question (read from ``question``, ``prompt`` or
    ``query``), one ``"<letter>. <option>"`` line per candidate, and the
    stripped ``post_prompt``, joined with newlines. ``pre_prompt`` is prepended
    verbatim, without a separator. Both prompts come from
    ``lmms_eval_specific_kwargs``; ``post_prompt`` defaults to
    "Answer with the option letter only.".
    """
    settings = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
    prefix = settings.get("pre_prompt", "")
    suffix = settings.get("post_prompt", "Answer with the option letter only.")

    stem = str(_safe_get(doc, ["question", "prompt", "query"], "")).strip()
    labelled = [f"{_CHOICE_LETTERS[position]}. {choice}" for position, choice in enumerate(_extract_candidates(doc))]

    parts = [stem] if stem else []
    parts.extend(labelled)
    if suffix:
        parts.append(str(suffix).strip())

    body = "\n".join(parts).strip()
    return f"{prefix}{body}" if prefix else body


def tvbench_doc_to_target(doc, model_specific_target_kwargs=None):
    """Return the gold option letter for ``doc``.

    The answer is read from ``answer``, ``correct_answer``, ``label`` or
    ``correct_choice`` and interpreted in this order:

    1. Text equal (after normalization) to one of the candidates maps to that
       candidate's letter. This runs first so that numeric option texts are
       not mistaken for indices: with candidates ``["3", "4", "5", "6"]`` the
       answer ``"4"`` is option B, not the fourth option.
    2. A single alphabetic character is taken as the option letter.
    3. An integer (or decimal string) is an option index: 0-based when it is
       in range, otherwise 1-based when it equals the number of candidates.
    4. Anything else is returned upper-cased and unchanged.

    ``model_specific_target_kwargs`` is accepted for interface compatibility
    and not used.
    """
    candidates = _extract_candidates(doc)
    answer = _safe_get(doc, ["answer", "correct_answer", "label", "correct_choice"], "")
    text = str(answer).strip()

    normalized_answer = _normalize_text(text)
    if normalized_answer:
        for index, candidate in enumerate(candidates):
            if _normalize_text(candidate) == normalized_answer:
                return _CHOICE_LETTERS[index]

    if len(text) == 1 and text.isalpha():
        return text.upper()

    index = None
    if isinstance(answer, int):
        index = answer
    elif text.isdecimal():
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        index = int(text)

    if index is not None:
        if 0 <= index < len(candidates):
            return _CHOICE_LETTERS[index]
        if 1 <= index <= len(candidates):
            return _CHOICE_LETTERS[index - 1]

    return text.upper()


def tvbench_process_results(doc, results):
    """Score one model response against the gold option letter.

    Args:
        doc: Dataset record holding the candidates and the answer.
        results: Model outputs for this record; only the first one is scored.
            An empty list is treated as an empty response.

    Returns:
        ``{"tvbench_acc": 1.0}`` when a letter could be extracted from the
        response and it equals the target letter, ``{"tvbench_acc": 0.0}``
        otherwise. A response from which no letter can be extracted is always
        wrong, even if the record's target is empty as well.
    """
    candidates = _extract_candidates(doc)
    prediction = results[0] if results else ""
    predicted_letter = _extract_choice_letter(prediction, candidates)
    target_letter = tvbench_doc_to_target(doc)
    is_correct = bool(predicted_letter) and predicted_letter == target_letter
    return {"tvbench_acc": 1.0 if is_correct else 0.0}


# Resolved once at import time (this reads _default_template_yaml from disk).
# None when the template names no cache directory; _candidate_video_paths then
# returns the bare video name.
_CACHE_DIR = _resolve_cache_dir()
64 changes: 64 additions & 0 deletions test/eval/test_tvbench_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import unittest
from unittest.mock import patch

from lmms_eval.tasks import TaskManager
from lmms_eval.tasks.tvbench import utils


class TestTVBenchTaskRegistration(unittest.TestCase):
    """The TVBench group and exactly its ten subtasks must be discoverable."""

    def test_tvbench_group_and_subtasks_are_registered(self):
        manager = TaskManager()
        wanted = set(
            [
                "tvbench_action_antonym",
                "tvbench_action_count",
                "tvbench_action_localization",
                "tvbench_action_sequence",
                "tvbench_egocentric_sequence",
                "tvbench_moving_direction",
                "tvbench_object_count",
                "tvbench_object_shuffle",
                "tvbench_scene_transition",
                "tvbench_unexpected_action",
            ]
        )

        self.assertIn("tvbench", manager.all_groups)

        # Every registered subtask with the tvbench_ prefix must be expected,
        # and every expected subtask must be registered.
        registered = set()
        for name in manager.all_subtasks:
            if name.startswith("tvbench_"):
                registered.add(name)
        self.assertSetEqual(registered, wanted)


class TestTVBenchUtils(unittest.TestCase):
    """Unit tests for the prompt, target, scoring and video-path helpers."""

    def setUp(self):
        # A fresh record per test; the gold answer "Sitting" is option B.
        self.doc = dict(
            question="What is the person doing?",
            candidates=["Running", "Sitting", "Jumping", "Standing"],
            answer="Sitting",
            video="sample_video.mp4",
        )

    def test_doc_to_text_formats_options_and_prompt(self):
        rendered = utils.tvbench_doc_to_text(self.doc)
        for fragment in ("What is the person doing?", "A. Running", "B. Sitting"):
            self.assertIn(fragment, rendered)
        self.assertTrue(rendered.endswith("Answer with the option letter only."))

    def test_doc_to_target_maps_answer_to_option_letter(self):
        target = utils.tvbench_doc_to_target(self.doc)
        self.assertEqual(target, "B")

    def test_process_results_accepts_option_letter(self):
        scores = utils.tvbench_process_results(self.doc, ["B"])
        self.assertEqual(scores["tvbench_acc"], 1.0)

    def test_process_results_accepts_option_text(self):
        scores = utils.tvbench_process_results(self.doc, ["The answer is Sitting."])
        self.assertEqual(scores["tvbench_acc"], 1.0)

    def test_doc_to_visual_returns_resolved_or_fallback_path(self):
        # No candidate exists on disk, so the fallback path must be returned.
        with patch("lmms_eval.tasks.tvbench.utils.os.path.exists", return_value=False):
            resolved = utils.tvbench_doc_to_visual(self.doc)
        self.assertEqual(len(resolved), 1)
        self.assertTrue(resolved[0].endswith("sample_video.mp4"))


# Allow running this test module directly as a script, in addition to
# discovery by a test runner.
if __name__ == "__main__":
    unittest.main()