EvolvingLMMs-Lab · Luodian · Feb 23, 2026 · Feb 22, 2026
diff --git a/docs/current_tasks.md b/docs/current_tasks.md
@@ -288,6 +288,17 @@ python -m lmms_eval --tasks list_with_num
   - mvbench_fine_grained_action
   - mvbench_moving_attribute
   - mvbench_egocentric_navigation
+- [TVBench](https://huggingface.co/datasets/FunAILab/TVBench) (tvbench)
+  - tvbench_action_antonym
+  - tvbench_action_count
+  - tvbench_action_localization
+  - tvbench_action_sequence
+  - tvbench_egocentric_sequence
+  - tvbench_moving_direction
+  - tvbench_object_count
+  - tvbench_object_shuffle
+  - tvbench_scene_transition
+  - tvbench_unexpected_action
 - [MotionBench](https://motion-bench.github.io/) (motionbench)
   - motionbench_full
 - [NExT-QA](https://github.com/doc-doc/NExT-QA) (nextqa)

diff --git a/lmms_eval/tasks/tvbench/_default_template_yaml b/lmms_eval/tasks/tvbench/_default_template_yaml
@@ -0,0 +1,24 @@
+dataset_path: FunAILab/TVBench
+dataset_kwargs:
+  token: True
+  cache_dir: tvbench  # utils._resolve_cache_dir joins this name onto HF_HOME to locate videos
+  video: True
+generation_kwargs:
+  max_new_tokens: 32
+  temperature: 0
+  do_sample: false
+output_type: generate_until
+doc_to_visual: !function utils.tvbench_doc_to_visual
+doc_to_text: !function utils.tvbench_doc_to_text
+doc_to_target: !function utils.tvbench_doc_to_target
+process_results: !function utils.tvbench_process_results
+metric_list:
+  - metric: tvbench_acc  # same key that utils.tvbench_process_results returns
+    aggregation: mean
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "Answer with the option letter only."  # same text as the fallback in utils.tvbench_doc_to_text
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/tvbench/tvbench.yaml b/lmms_eval/tasks/tvbench/tvbench.yaml
@@ -0,0 +1,12 @@
+group: tvbench
+task:
+  - tvbench_action_antonym
+  - tvbench_action_count
+  - tvbench_action_localization
+  - tvbench_action_sequence
+  - tvbench_egocentric_sequence
+  - tvbench_moving_direction
+  - tvbench_object_count
+  - tvbench_object_shuffle
+  - tvbench_scene_transition
+  - tvbench_unexpected_action
diff --git a/lmms_eval/tasks/tvbench/tvbench_action_antonym.yaml b/lmms_eval/tasks/tvbench/tvbench_action_antonym.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_action_antonym
+dataset_name: action_antonym
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/tvbench_action_count.yaml b/lmms_eval/tasks/tvbench/tvbench_action_count.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_action_count
+dataset_name: action_count
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/tvbench_action_localization.yaml b/lmms_eval/tasks/tvbench/tvbench_action_localization.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_action_localization
+dataset_name: action_localization
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/tvbench_action_sequence.yaml b/lmms_eval/tasks/tvbench/tvbench_action_sequence.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_action_sequence
+dataset_name: action_sequence
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/tvbench_egocentric_sequence.yaml b/lmms_eval/tasks/tvbench/tvbench_egocentric_sequence.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_egocentric_sequence
+dataset_name: egocentric_sequence
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/tvbench_moving_direction.yaml b/lmms_eval/tasks/tvbench/tvbench_moving_direction.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_moving_direction
+dataset_name: moving_direction
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/tvbench_object_count.yaml b/lmms_eval/tasks/tvbench/tvbench_object_count.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_object_count
+dataset_name: object_count
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/tvbench_object_shuffle.yaml b/lmms_eval/tasks/tvbench/tvbench_object_shuffle.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_object_shuffle
+dataset_name: object_shuffle
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/tvbench_scene_transition.yaml b/lmms_eval/tasks/tvbench/tvbench_scene_transition.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_scene_transition
+dataset_name: scene_transition
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/tvbench_unexpected_action.yaml b/lmms_eval/tasks/tvbench/tvbench_unexpected_action.yaml
@@ -0,0 +1,4 @@
+include: _default_template_yaml
+task: tvbench_unexpected_action
+dataset_name: unexpected_action
+test_split: train
diff --git a/lmms_eval/tasks/tvbench/utils.py b/lmms_eval/tasks/tvbench/utils.py
@@ -0,0 +1,193 @@
+import os
+import re
+from pathlib import Path
+
+import yaml
+
+_CHOICE_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"  # option labels; candidate i is shown as _CHOICE_LETTERS[i]
+_DATASET_NAMES = [  # TVBench subset names; _candidate_video_paths probes a sub-directory per name
+    "action_antonym",
+    "action_count",
+    "action_localization",
+    "action_sequence",
+    "egocentric_sequence",
+    "moving_direction",
+    "object_count",
+    "object_shuffle",
+    "scene_transition",
+    "unexpected_action",
+]
+
+
+def _safe_get(doc, keys, default=""):
+    """Return the first non-None ``doc`` value among ``keys``, else ``default``."""
+    # Keys are probed lazily and in order; a key mapped to None counts as missing.
+    looked_up = (doc.get(name) for name in keys)
+    present = (item for item in looked_up if item is not None)
+    return next(present, default)
+
+
+def _normalize_text(text):  # lower-case, collapse whitespace runs; falsy input becomes ""
+    return " ".join(str(text or "").lower().split())
+
+
+def _extract_candidates(doc):
+    """Return the answer options of ``doc`` as a list of strings.
+
+    A list under ``candidates`` (or ``options`` when that key is absent) wins;
+    otherwise non-empty ``option0`` .. ``option25`` fields are collected in order.
+    """
+    listed = doc.get("candidates", doc.get("options"))
+    if isinstance(listed, list):
+        return list(map(str, listed))
+    keys = (f"option{position}" for position in range(len(_CHOICE_LETTERS)))
+    return [str(doc[key]) for key in keys if key in doc and doc[key] not in (None, "")]
+
+
+def _resolve_cache_dir():
+    """Return ``<HF_HOME>/<cache_dir>`` from the task template, or None if unset."""
+    template = Path(__file__).parent / "_default_template_yaml"
+    with open(template, "r", encoding="utf-8") as stream:
+        # Lines carrying the custom ``!function`` tag are dropped before safe_load parses the rest.
+        plain_yaml = "".join(row for row in stream if "!function" not in row)
+    dataset_kwargs = (yaml.safe_load(plain_yaml) or {}).get("dataset_kwargs", {})
+    cache_name = dataset_kwargs.get("cache_dir", "")
+    hf_root = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
+    return os.path.join(hf_root, str(cache_name)) if cache_name else None
+
+
+def _candidate_video_paths(video_name):
+    """List the paths under the cache directory where ``video_name`` may be stored.
+
+    Without a cache directory the name itself is the only candidate. Otherwise the
+    name is tried at the cache root, under ``video/``, ``videos/`` and ``data/``,
+    and per dataset under ``<dataset>/``, ``video/<dataset>/`` and
+    ``videos/<dataset>/``. Duplicates are dropped, keeping first-seen order.
+    """
+    if not _CACHE_DIR:
+        return [video_name]
+
+    relative = [video_name]
+    relative += [os.path.join(top, video_name) for top in ("video", "videos", "data")]
+    for subset in _DATASET_NAMES:
+        relative.append(os.path.join(subset, video_name))
+        relative += [os.path.join(top, subset, video_name) for top in ("video", "videos")]
+
+    # dict.fromkeys de-duplicates while preserving insertion order.
+    absolute = (os.path.join(_CACHE_DIR, rel) for rel in relative)
+    return list(dict.fromkeys(absolute))
+
+
+def _extract_choice_letter(prediction, candidates):
+    """Map a free-form model ``prediction`` to an option letter ("" if undecidable).
+
+    Tried in order: the first standalone letter, a leading letter that does not
+    start a longer word (e.g. "B2"), the single candidate whose normalized text
+    occurs in the prediction, and finally any later standalone option letter.
+    Options A and B always count as valid, even with fewer than two candidates.
+    """
+    text = str(prediction or "").strip()
+    if not text:
+        return ""
+    all_choices = _CHOICE_LETTERS[: max(len(candidates), 2)]
+    uppercase = text.upper()
+    standalone = re.findall(r"\b([A-Z])\b", uppercase)
+    if standalone and standalone[0] in all_choices:
+        return standalone[0]
+    # The lookahead stops a word initial ("Dancing") from passing as option "D".
+    prefix_match = re.match(r"^\s*[\(\[]?([A-Z])(?![A-Z])[\)\].:]?", uppercase)
+    if prefix_match and prefix_match.group(1) in all_choices:
+        return prefix_match.group(1)
+    normalized_pred = _normalize_text(text)
+    hits = [i for i, option in enumerate(map(_normalize_text, candidates)) if option and option in normalized_pred]
+    if len(hits) == 1:
+        return all_choices[hits[0]]
+    # Last resort: "I think B" has a valid letter after the invalid standalone "I".
+    return next((letter for letter in standalone if letter in all_choices), "")
+
+
+def tvbench_doc_to_visual(doc, lmms_eval_specific_kwargs=None):
+    """Return the video path(s) for ``doc`` as a list of strings.
+
+    The value is read from ``video``, ``video_path`` or ``video_file`` (unwrapping
+    a dict via ``path``/``video``/``filename``). A list is returned element-wise
+    as strings without path resolution; an empty value yields ``[]``. Otherwise
+    the first existing path wins, falling back to the first candidate location.
+    """
+    video_value = _safe_get(doc, ["video", "video_path", "video_file"], "")
+    if isinstance(video_value, dict):
+        video_value = _safe_get(video_value, ["path", "video", "filename"], "")
+    if isinstance(video_value, list):
+        return [str(video) for video in video_value]
+
+    video_name = str(video_value).strip()
+    if not video_name:
+        return []
+    if os.path.isabs(video_name) and os.path.exists(video_name):
+        return [video_name]
+
+    # Resolve the candidate list once instead of rebuilding it for the fallback.
+    candidates = _candidate_video_paths(video_name) or [video_name]
+    return [next((path for path in candidates if os.path.exists(path)), candidates[0])]
+
+
+def tvbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the multiple-choice prompt for ``doc``.
+
+    Layout: the question, one "<letter>. <option>" line per candidate, then the
+    ``post_prompt``; a non-empty ``pre_prompt`` is prepended without a separator.
+    Both prompts are read from ``lmms_eval_specific_kwargs``.
+    """
+    settings = lmms_eval_specific_kwargs or {}
+    prefix = settings.get("pre_prompt", "")
+    suffix = settings.get("post_prompt", "Answer with the option letter only.")
+
+    stem = str(_safe_get(doc, ["question", "prompt", "query"], "")).strip()
+    parts = [stem] if stem else []
+    parts += [f"{_CHOICE_LETTERS[slot]}. {option}" for slot, option in enumerate(_extract_candidates(doc))]
+    if suffix:
+        parts.append(str(suffix).strip())
+
+    body = "\n".join(parts).strip()
+    # An empty or missing pre_prompt leaves the body untouched.
+    return f"{prefix}{body}" if prefix else body
+
+
+def tvbench_doc_to_target(doc, model_specific_target_kwargs=None):
+    """Return the gold answer of ``doc`` as an option letter.
+
+    The answer is read from ``answer``, ``correct_answer``, ``label`` or
+    ``correct_choice``. Integers and digit strings are used as a 0-based index
+    when in range, otherwise as a 1-based index; a single letter is upper-cased;
+    other text is compared with the normalized candidates. When nothing
+    matches, the stripped answer is returned upper-cased.
+    """
+    candidates = _extract_candidates(doc)
+    answer = _safe_get(doc, ["answer", "correct_answer", "label", "correct_choice"], "")
+    text = str(answer).strip()
+    if isinstance(answer, int):
+        position = answer
+    elif len(text) == 1 and text.isalpha():
+        return text.upper()
+    else:
+        position = int(text) if text.isdigit() else None
+    if position is not None:
+        # 0-based is tried first, so 1-based only applies when it equals len(candidates).
+        for slot in (position, position - 1):
+            if 0 <= slot < len(candidates):
+                return _CHOICE_LETTERS[slot]
+
+    wanted = _normalize_text(text)
+    hit = next((i for i, option in enumerate(candidates) if _normalize_text(option) == wanted), None)
+    return text.upper() if hit is None else _CHOICE_LETTERS[hit]
+
+
+def tvbench_process_results(doc, results):
+    """Score the first generation: ``tvbench_acc`` is 1.0 on a letter match, else 0.0."""
+    response = results[0] if results else ""
+    guessed = _extract_choice_letter(response, _extract_candidates(doc))
+    is_correct = guessed == tvbench_doc_to_target(doc)
+    return {"tvbench_acc": float(is_correct)}
+
+
+_CACHE_DIR = _resolve_cache_dir()  # computed once at import time; None when the template names no cache_dir
diff --git a/test/eval/test_tvbench_task.py b/test/eval/test_tvbench_task.py
@@ -0,0 +1,64 @@
+import unittest
+from unittest.mock import patch
+
+from lmms_eval.tasks import TaskManager
+from lmms_eval.tasks.tvbench import utils
+
+
+class TestTVBenchTaskRegistration(unittest.TestCase):
+    """Checks that the task manager exposes the ``tvbench`` group and its subtasks."""
+    def test_tvbench_group_and_subtasks_are_registered(self):
+        subset_names = (
+            "action_antonym",
+            "action_count",
+            "action_localization",
+            "action_sequence",
+            "egocentric_sequence",
+            "moving_direction",
+            "object_count",
+            "object_shuffle",
+            "scene_transition",
+            "unexpected_action",
+        )
+        manager = TaskManager()
+        self.assertIn("tvbench", manager.all_groups)
+        registered = {name for name in manager.all_subtasks if name.startswith("tvbench_")}
+        self.assertSetEqual(registered, {f"tvbench_{subset}" for subset in subset_names})
+
+
+class TestTVBenchUtils(unittest.TestCase):
+    """Unit tests for the TVBench prompt, target, scoring and video-path helpers."""
+    def setUp(self):
+        self.doc = dict(
+            question="What is the person doing?",
+            candidates=["Running", "Sitting", "Jumping", "Standing"],
+            answer="Sitting",
+            video="sample_video.mp4",
+        )
+
+    def test_doc_to_text_formats_options_and_prompt(self):
+        rendered = utils.tvbench_doc_to_text(self.doc)
+        for fragment in ("What is the person doing?", "A. Running", "B. Sitting"):
+            self.assertIn(fragment, rendered)
+        self.assertTrue(rendered.endswith("Answer with the option letter only."))
+
+    def test_doc_to_target_maps_answer_to_option_letter(self):
+        self.assertEqual(utils.tvbench_doc_to_target(self.doc), "B")
+
+    def test_process_results_accepts_option_letter(self):
+        scores = utils.tvbench_process_results(self.doc, ["B"])
+        self.assertEqual(scores["tvbench_acc"], 1.0)
+
+    def test_process_results_accepts_option_text(self):
+        scores = utils.tvbench_process_results(self.doc, ["The answer is Sitting."])
+        self.assertEqual(scores["tvbench_acc"], 1.0)
+
+    def test_doc_to_visual_returns_resolved_or_fallback_path(self):
+        with patch("lmms_eval.tasks.tvbench.utils.os.path.exists", return_value=False):
+            found = utils.tvbench_doc_to_visual(self.doc)
+        self.assertEqual(len(found), 1)
+        self.assertTrue(found[0].endswith("sample_video.mp4"))
+
+
+if __name__ == "__main__":
+    unittest.main()