Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/current_tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ python -m lmms_eval --tasks list_with_num
- egoschema_subset_mcppl
- egoschema_subset
- [EgoPlan](https://github.com/ChenYi99/EgoPlan) (egoplan)
- [EgoTempo](https://github.com/google-research-datasets/egotempo) (egotempo)
- [EgoThink](https://github.com/AdaCheng/EgoThink) (egothink)
- [MLVU](https://github.com/JUNJIE99/MLVU) (mlvu)
- [MMT-Bench](https://mmt-bench.github.io/) (mmt)
Expand Down
29 changes: 29 additions & 0 deletions lmms_eval/tasks/egotempo/egotempo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# EgoTempo open-ended video QA task configuration.
# Annotations are loaded with the generic `json` dataset loader from the
# upstream EgoTempo repository; `field: annotations` selects the key of the
# JSON document that holds the records.
dataset_path: json
dataset_kwargs:
  data_files:
    test: https://raw.githubusercontent.com/google-research-datasets/egotempo/main/egotempo_openQA.json
  field: annotations
task: egotempo
test_split: test
output_type: generate_until
# Document accessors; all of them are defined in utils.py next to this file.
doc_to_visual: !function utils.egotempo_doc_to_visual
doc_to_text: !function utils.egotempo_doc_to_text
doc_to_target: !function utils.egotempo_doc_to_target
# Sampling is disabled and answers are capped at 64 new tokens.
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  do_sample: false
process_results: !function utils.egotempo_process_results
# Single metric: per-sample ANLS averaged by the custom aggregation function.
metric_list:
  - metric: egotempo_anls
    aggregation: !function utils.egotempo_aggregate_results
    higher_is_better: true
# Prompt affixes wrapped around the question by egotempo_doc_to_text.
# NOTE(review): presumably `default` applies unless a model-specific entry
# such as `qwen_vl` matches -- confirm against the lmms_eval task loader.
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with a short phrase."
  qwen_vl:
    pre_prompt: ""
    post_prompt: " Answer:"
metadata:
  - version: 0.0
166 changes: 166 additions & 0 deletions lmms_eval/tasks/egotempo/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Any

from lmms_eval.utils import eval_logger

# File extensions tried, in this order, when _resolve_video_path looks for
# "<clip_id>.<extension>" inside each candidate directory. Both "mp4" and
# "MP4" are listed explicitly.
_VIDEO_EXTENSIONS = ("mp4", "MP4", "mkv", "webm", "mov")


def _normalize_text(text: Any) -> str:
    """Return *text* trimmed, lower-cased and with whitespace runs collapsed.

    ``None`` becomes the empty string; every other value is converted with
    ``str()`` before normalization.
    """
    raw = "" if text is None else str(text)
    # Trim and lower-case first, then squeeze any internal whitespace run
    # (spaces, tabs, newlines) down to a single space.
    return re.sub(r"\s+", " ", raw.strip().lower())


def _levenshtein_distance(left: str, right: str) -> int:
    """Return the Levenshtein edit distance between *left* and *right*.

    Insertions, deletions and substitutions each cost 1. Only two rows of
    the dynamic-programming table are kept, and the shorter string is used
    for the row dimension to keep those rows small.
    """
    if left == right:
        return 0
    if not left or not right:
        # One side is empty: the distance is the length of the other side.
        return max(len(left), len(right))

    # ``sorted`` is stable, so equal-length inputs keep their given order.
    shorter, longer = sorted((left, right), key=len)

    prev_row = list(range(len(shorter) + 1))
    for row_idx, long_ch in enumerate(longer, start=1):
        row = [row_idx]
        for col_idx, short_ch in enumerate(shorter, start=1):
            mismatch = 0 if short_ch == long_ch else 1
            row.append(
                min(
                    prev_row[col_idx] + 1,  # cell above
                    row[col_idx - 1] + 1,  # cell to the left
                    prev_row[col_idx - 1] + mismatch,  # diagonal
                )
            )
        prev_row = row
    return prev_row[-1]


def _anls_score(prediction: str, answer: str, threshold: float = 0.5) -> float:
    """Return a thresholded normalized Levenshtein similarity in ``[0, 1]``.

    Both strings are normalized with ``_normalize_text`` first. The
    similarity is ``1 - distance / max(len)``; values below *threshold* are
    clamped to ``0.0``. Two empty strings score ``1.0``; exactly one empty
    string scores ``0.0``.
    """
    hypothesis = _normalize_text(prediction)
    reference = _normalize_text(answer)

    if not hypothesis or not reference:
        # At least one side is empty: a match only if both are empty.
        return 1.0 if hypothesis == reference else 0.0

    edits = _levenshtein_distance(hypothesis, reference)
    similarity = 1.0 - edits / max(len(hypothesis), len(reference))
    return 0.0 if similarity < threshold else similarity


def _strip_answer_prefix(text: str) -> str:
    """Remove one leading "the answer is"-style phrase from a model output.

    Matching is case-insensitive and only the first matching phrase (in the
    order listed below) is removed. After removal, surrounding spaces,
    colons, periods and hyphens are stripped. Text without a known prefix is
    returned merely whitespace-trimmed.
    """
    known_prefixes = (
        "the answer is",
        "answer:",
        "the correct answer is",
        "the final answer is",
    )

    trimmed = str(text).strip()
    folded = trimmed.lower()
    matched = next((p for p in known_prefixes if folded.startswith(p)), None)
    if matched is None:
        return trimmed
    # Slice the original-case text so the answer keeps its casing.
    return trimmed[len(matched) :].strip(" :.-")


def _candidate_video_dirs() -> list[Path]:
    """Return the directories to search for EgoTempo clips, in priority order.

    The order is: ``$EGOTEMPO_VIDEO_DIR``, ``$EGOTEMPO_CACHE_DIR`` (each only
    when set to a non-blank value), then ``<HF_HOME>/egotempo`` where
    ``HF_HOME`` defaults to ``~/.cache/huggingface/``. ``~`` is expanded and
    duplicate paths (compared by their string form) are dropped, keeping the
    first occurrence.
    """
    # Insertion-ordered mapping from string form to Path gives
    # first-occurrence-wins de-duplication.
    ordered: dict[str, Path] = {}

    for env_name in ("EGOTEMPO_VIDEO_DIR", "EGOTEMPO_CACHE_DIR"):
        configured = os.getenv(env_name, "").strip()
        if configured:
            directory = Path(os.path.expanduser(configured))
            ordered.setdefault(str(directory), directory)

    hf_root = Path(os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/")))
    fallback = hf_root / "egotempo"
    ordered.setdefault(str(fallback), fallback)

    return list(ordered.values())


def _resolve_video_path(clip_id: str) -> str | None:
if clip_id == "":
return None

for root in _candidate_video_dirs():
for extension in _VIDEO_EXTENSIONS:
candidate = root / f"{clip_id}.{extension}"
if candidate.exists():
return str(candidate)
return None


def egotempo_doc_to_visual(doc):
    """Return the local video file for *doc* as a one-element list.

    Args:
        doc: Annotation record; its ``clip_id`` value (converted to ``str``
            and trimmed) names the video file to look up.

    Returns:
        ``[path]`` when ``_resolve_video_path`` finds the clip, otherwise an
        empty list.
    """
    clip_id = str(doc.get("clip_id", "")).strip()
    video_path = _resolve_video_path(clip_id)
    if video_path is None:
        # Keep the best-effort empty result, but make the miss visible:
        # without this the sample is silently evaluated with no video and the
        # resulting scores are misleading.
        eval_logger.warning(
            "EgoTempo video for clip_id '{}' was not found; set EGOTEMPO_VIDEO_DIR to the directory containing the clips.",
            clip_id,
        )
        return []
    return [video_path]


def egotempo_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the prompt: ``pre_prompt`` + trimmed question + ``post_prompt``.

    Args:
        doc: Annotation record; ``question`` is read (missing -> ``""``).
        lmms_eval_specific_kwargs: Optional mapping that may provide
            ``pre_prompt`` and ``post_prompt``; absent keys, or a falsy
            mapping, contribute empty strings.
    """
    prompt_cfg = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
    prefix = prompt_cfg.get("pre_prompt", "")
    suffix = prompt_cfg.get("post_prompt", "")
    body = str(doc.get("question", "")).strip()
    return "{}{}{}".format(prefix, body, suffix)


def egotempo_doc_to_target(doc):
    """Return the reference answer of *doc* as a trimmed string.

    A missing ``answer`` key yields the empty string.
    """
    raw_answer = doc.get("answer", "")
    return str(raw_answer).strip()


def egotempo_process_results(doc, results):
    """Score one model output against the reference answer of *doc*.

    Args:
        doc: Annotation record; ``answer`` and ``question_type`` are read.
        results: Model outputs; only the first element is used, and an empty
            sequence is treated as an empty prediction.

    Returns:
        ``{"egotempo_anls": {"score": ..., "question_type": ...}}`` where the
        score comes from ``_anls_score`` after any leading answer phrase has
        been removed from the prediction.
    """
    raw_output = results[0] if results else ""
    cleaned_output = _strip_answer_prefix(raw_output)
    reference = str(doc.get("answer", "")).strip()

    sample = {
        "score": _anls_score(cleaned_output, reference),
        "question_type": str(doc.get("question_type", "unknown")),
    }
    return {"egotempo_anls": sample}


def egotempo_aggregate_results(items):
    """Return the mean ANLS over *items*, logging per-category means.

    Args:
        items: Per-sample dicts with ``score`` and ``question_type`` keys
            (missing keys default to ``0.0`` and ``"unknown"``).

    Returns:
        The overall mean score as a fraction, or ``0.0`` for empty input.
        Logged values are scaled by 100.
    """
    if not items:
        return 0.0

    running_total = 0.0
    per_type = defaultdict(list)
    for entry in items:
        value = float(entry.get("score", 0.0))
        label = str(entry.get("question_type", "unknown"))
        running_total += value
        per_type[label].append(value)

    # Report categories alphabetically so log output is stable across runs.
    for label, values in sorted(per_type.items()):
        mean_for_label = sum(values) / len(values)
        eval_logger.info("EgoTempo [{}] ANLS: {:.2f}", label, mean_for_label * 100)

    overall = running_total / len(items)
    eval_logger.info("EgoTempo overall ANLS: {:.2f}", overall * 100)
    return overall