EvolvingLMMs-Lab · Luodian · Feb 23, 2026 · Feb 22, 2026
diff --git a/docs/current_tasks.md b/docs/current_tasks.md
@@ -262,6 +262,7 @@ python -m lmms_eval --tasks list_with_num
   - egoschema_subset_mcppl
   - egoschema_subset
 - [EgoPlan](https://github.com/ChenYi99/EgoPlan) (egoplan)
+- [EgoTempo](https://github.com/google-research-datasets/egotempo) (egotempo)
 - [EgoThink](https://github.com/AdaCheng/EgoThink) (egothink)
 - [MLVU](https://github.com/JUNJIE99/MLVU) (mlvu)
 - [MMT-Bench](https://mmt-bench.github.io/) (mmt)

diff --git a/lmms_eval/tasks/egotempo/egotempo.yaml b/lmms_eval/tasks/egotempo/egotempo.yaml
@@ -0,0 +1,29 @@
+# EgoTempo open-ended QA task.
+# Annotations are loaded as JSON from the URL below; video files are NOT part of
+# this dataset spec and are looked up on local disk by utils.py
+# ($EGOTEMPO_VIDEO_DIR, $EGOTEMPO_CACHE_DIR, then $HF_HOME/egotempo).
+dataset_path: json
+dataset_kwargs:
+  data_files:
+    test: https://raw.githubusercontent.com/google-research-datasets/egotempo/main/egotempo_openQA.json
+  # NOTE(review): presumably the top-level JSON key that holds the list of records -- confirm against the file.
+  field: annotations
+task: egotempo
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.egotempo_doc_to_visual
+doc_to_text: !function utils.egotempo_doc_to_text
+doc_to_target: !function utils.egotempo_doc_to_target
+# Short, non-sampled generations: answers are expected to be brief phrases.
+generation_kwargs:
+  max_new_tokens: 64
+  temperature: 0
+  do_sample: false
+process_results: !function utils.egotempo_process_results
+# The metric name must match the key returned by utils.egotempo_process_results.
+metric_list:
+  - metric: egotempo_anls
+    aggregation: !function utils.egotempo_aggregate_results
+    higher_is_better: true
+# Prompt affixes read by utils.egotempo_doc_to_text (pre_prompt + question + post_prompt).
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with a short phrase."
+  qwen_vl:
+    pre_prompt: ""
+    post_prompt: " Answer:"
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/egotempo/utils.py b/lmms_eval/tasks/egotempo/utils.py
@@ -0,0 +1,166 @@
+import os
+import re
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+from lmms_eval.utils import eval_logger
+
+# File extensions probed, in this order, when resolving a clip id to a video file.
+_VIDEO_EXTENSIONS = ("mp4", "MP4", "mkv", "webm", "mov")
+
+
+def _normalize_text(text: Any) -> str:
+    """Return ``text`` trimmed, lower-cased and with whitespace runs collapsed.
+
+    ``None`` maps to the empty string; any other value is converted with
+    ``str()`` before normalisation.
+    """
+    if text is None:
+        return ""
+    lowered = str(text).strip().lower()
+    # Collapse every run of whitespace (tabs, newlines, ...) to one space.
+    return re.sub(r"\s+", " ", lowered)
+
+
+def _levenshtein_distance(left: str, right: str) -> int:
+    """Return the edit distance between two strings.
+
+    Insertions, deletions and substitutions each cost 1. Only one row of the
+    dynamic-programming table is kept at a time, sized by the shorter string.
+    """
+    if left == right:
+        return 0
+    if not (left and right):
+        # Exactly one side is empty here, so the distance is the other's length.
+        return len(left) + len(right)
+
+    # Put the shorter string on the row axis to keep the rolling row small.
+    shorter, longer = (left, right) if len(left) <= len(right) else (right, left)
+
+    row = list(range(len(shorter) + 1))
+    for row_index, longer_ch in enumerate(longer, start=1):
+        next_row = [row_index]
+        for col_index, shorter_ch in enumerate(shorter, start=1):
+            mismatch = shorter_ch != longer_ch
+            next_row.append(
+                min(
+                    row[col_index] + 1,
+                    next_row[col_index - 1] + 1,
+                    row[col_index - 1] + mismatch,
+                )
+            )
+        row = next_row
+    return row[-1]
+
+
+def _anls_score(prediction: str, answer: str, threshold: float = 0.5) -> float:
+    """Score ``prediction`` against ``answer`` by normalised Levenshtein similarity.
+
+    Both strings are passed through ``_normalize_text`` first. The similarity is
+    ``1 - distance / max(len(prediction), len(answer))``; a similarity below
+    ``threshold`` is reported as 0.0. Two empty strings score 1.0 and exactly
+    one empty string scores 0.0.
+    """
+    norm_pred = _normalize_text(prediction)
+    norm_answer = _normalize_text(answer)
+
+    if not norm_pred or not norm_answer:
+        # Both empty counts as a match; a single empty side is a miss.
+        return 1.0 if norm_pred == norm_answer else 0.0
+
+    longest = max(len(norm_pred), len(norm_answer))
+    similarity = 1.0 - _levenshtein_distance(norm_pred, norm_answer) / longest
+    return 0.0 if similarity < threshold else similarity
+
+
+def _strip_answer_prefix(text: str) -> str:
+    """Drop one leading answer lead-in (e.g. "the answer is") from a response.
+
+    Matching is case-insensitive and only the first lead-in in the list below
+    that matches is removed; the remainder then has surrounding spaces, colons,
+    periods and hyphens stripped. Text without a lead-in is returned with only
+    its surrounding whitespace trimmed.
+    """
+    response = str(text).strip()
+    folded = response.lower()
+    lead_ins = (
+        "the answer is",
+        "answer:",
+        "the correct answer is",
+        "the final answer is",
+    )
+
+    matched = next((lead_in for lead_in in lead_ins if folded.startswith(lead_in)), None)
+    if matched is None:
+        return response
+    return response[len(matched) :].strip(" :.-")
+
+
+def _candidate_video_dirs() -> list[Path]:
+    """List the directories searched for EgoTempo clips, highest priority first.
+
+    Order: ``$EGOTEMPO_VIDEO_DIR``, then ``$EGOTEMPO_CACHE_DIR`` (each only when
+    set to a non-blank value), then ``<HF_HOME>/egotempo`` where ``HF_HOME``
+    falls back to ``~/.cache/huggingface/``. ``~`` is expanded, and paths whose
+    string form repeats are dropped, keeping the first occurrence.
+    """
+    configured = (os.getenv(name, "").strip() for name in ("EGOTEMPO_VIDEO_DIR", "EGOTEMPO_CACHE_DIR"))
+    roots = [Path(os.path.expanduser(value)) for value in configured if value]
+
+    hf_root = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/"))
+    roots.append(Path(hf_root) / "egotempo")
+
+    # Dedupe on the string form while preserving first-seen order.
+    unique = {}
+    for root in roots:
+        unique.setdefault(str(root), root)
+    return list(unique.values())
+
+
+def _resolve_video_path(clip_id: str) -> str | None:
+    """Find the first existing ``<dir>/<clip_id>.<ext>`` file for a clip.
+
+    Directories come from ``_candidate_video_dirs()`` and extensions from
+    ``_VIDEO_EXTENSIONS``; every extension is tried in one directory before
+    moving to the next. Returns the path as a string, or ``None`` when
+    ``clip_id`` is empty or no candidate exists.
+    """
+    if clip_id == "":
+        return None
+
+    candidates = (
+        root / f"{clip_id}.{extension}"
+        for root in _candidate_video_dirs()
+        for extension in _VIDEO_EXTENSIONS
+    )
+    found = next((path for path in candidates if path.exists()), None)
+    return None if found is None else str(found)
+
+
+def egotempo_doc_to_visual(doc):
+    """Return the local video path for a document as a one-element list.
+
+    The clip is looked up by ``doc["clip_id"]`` (stringified and stripped) via
+    ``_resolve_video_path``. When no file is found the result is still an empty
+    list, but a warning is logged so the sample is not silently evaluated
+    without its video.
+    """
+    clip_id = str(doc.get("clip_id", "")).strip()
+    video_path = _resolve_video_path(clip_id)
+    if video_path is None:
+        # Same brace-style message formatting as the other eval_logger calls in this module.
+        eval_logger.warning(
+            "EgoTempo: no video found for clip_id '{}'; set EGOTEMPO_VIDEO_DIR or EGOTEMPO_CACHE_DIR to the directory holding the clips.",
+            clip_id,
+        )
+        return []
+    return [video_path]
+
+
+def egotempo_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the prompt as ``pre_prompt`` + stripped question + ``post_prompt``.
+
+    A missing kwargs mapping, or missing ``pre_prompt`` / ``post_prompt`` keys,
+    contribute an empty string; a missing ``question`` does too.
+    """
+    prompt_cfg = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
+    prefix = prompt_cfg.get("pre_prompt", "")
+    suffix = prompt_cfg.get("post_prompt", "")
+    body = str(doc.get("question", "")).strip()
+    return f"{prefix}{body}{suffix}"
+
+
+def egotempo_doc_to_target(doc):
+    """Return ``doc["answer"]`` as a stripped string ("" when the key is absent)."""
+    reference = doc.get("answer", "")
+    return str(reference).strip()
+
+
+def egotempo_process_results(doc, results):
+    """Score one sample and package it under the ``egotempo_anls`` metric key.
+
+    The first entry of ``results`` (or "" when ``results`` is empty) has any
+    answer lead-in removed by ``_strip_answer_prefix`` and is compared with the
+    stripped ``doc["answer"]`` using ``_anls_score``. The payload carries the
+    score and the document's ``question_type`` ("unknown" when missing).
+    """
+    raw_output = results[0] if results else ""
+    reference = str(doc.get("answer", "")).strip()
+    entry = {
+        "score": _anls_score(_strip_answer_prefix(raw_output), reference),
+        "question_type": str(doc.get("question_type", "unknown")),
+    }
+    return {"egotempo_anls": entry}
+
+
+def egotempo_aggregate_results(items):
+    """Average per-sample scores and log a per-``question_type`` breakdown.
+
+    Each item is a mapping with a ``score`` (default 0.0) and a
+    ``question_type`` (default "unknown"). Returns the mean score over all
+    items, or 0.0 for empty input. Logged values are the means times 100.
+    """
+    if not items:
+        return 0.0
+
+    running_total = 0.0
+    grouped = defaultdict(list)
+    for entry in items:
+        value = float(entry.get("score", 0.0))
+        running_total += value
+        grouped[str(entry.get("question_type", "unknown"))].append(value)
+
+    # Keys are unique, so sorting the (key, values) pairs orders by key only.
+    for question_type, values in sorted(grouped.items()):
+        mean_for_type = sum(values) / len(values)
+        eval_logger.info("EgoTempo [{}] ANLS: {:.2f}", question_type, mean_for_type * 100)
+
+    mean_overall = running_total / len(items)
+    eval_logger.info("EgoTempo overall ANLS: {:.2f}", mean_overall * 100)
+    return mean_overall