egotaskqa: add task port to upstream lmms-eval subtree (#1338)

njb-nvidia · web-flow · commit 531dfa9adec4 · 2026-05-20T13:18:07.000+08:00
Adds `egotaskqa` task that loads MCQ annotations from
`nv-njb/EgoTaskQA-MCQ` on HuggingFace and resolves videos from
`EGOTASKQA_VIDEO_DIR` (defaults to `~/.cache/lmms_eval/egotaskqa/videos/`).
Videos are obtained separately under the upstream EgoTaskQA license.

Verified on Qwen3-VL-2B (500 samples, seed=42): upstream lmms-eval scores
0.964 vs. 0.968 for the local fork, with 99.6% of per-doc predictions identical.
diff --git a/lmms_eval/tasks/egotaskqa/egotaskqa.yaml b/lmms_eval/tasks/egotaskqa/egotaskqa.yaml
@@ -0,0 +1,24 @@
+# EgoTaskQA-MCQ task config. Annotations are loaded from the HuggingFace
+# dataset below; video files are resolved locally by
+# utils.egotaskqa_doc_to_visual.
+dataset_path: nv-njb/EgoTaskQA-MCQ
+task: egotaskqa
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.egotaskqa_doc_to_visual
+doc_to_text: !function utils.egotaskqa_doc_to_text
+# Dataset field with the ground-truth answer; utils.egotaskqa_process_results
+# reads the same field and it is compared against the extracted option letter.
+doc_to_target: "a"
+# Deterministic decoding: sampling disabled, a single beam.
+generation_kwargs:
+  max_new_tokens: 128
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# process_results emits one {pred_answer, ground_truth} record per document;
+# the aggregation function returns the fraction of records where they match.
+process_results: !function utils.egotaskqa_process_results
+metric_list:
+  - metric: egotaskqa_accuracy
+    aggregation: !function utils.egotaskqa_aggregate_results
+    higher_is_better: true
+# pre_prompt / post_prompt are wrapped around the question and option list by
+# utils.egotaskqa_doc_to_text.
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly and only give the best option. The best answer is: "
+metadata:
+  version: 0.1
diff --git a/lmms_eval/tasks/egotaskqa/utils.py b/lmms_eval/tasks/egotaskqa/utils.py
@@ -0,0 +1,89 @@
+"""Utility functions for the EgoTaskQA-MCQ benchmark.
+
+Annotations are loaded from the ``nv-njb/EgoTaskQA-MCQ`` HuggingFace dataset.
+Videos are *not* redistributed — users must obtain them through the official
+EgoTaskQA license process at https://sites.google.com/view/egotaskqa.
+
+The video directory is resolved in this order:
+
+1. ``EGOTASKQA_VIDEO_DIR`` environment variable, if set.
+2. ``~/.cache/lmms_eval/egotaskqa/videos/`` (default).
+
+Place the downloaded ``qa_videos/*.mp4`` files in that directory — no rename
+is needed; filenames already match the ``video_path`` field.
+"""
+
+import os
+import re
+from pathlib import Path
+
+from loguru import logger as eval_logger
+
+
+def _video_dir() -> str:
+    """Return the directory that holds the EgoTaskQA video files.
+
+    ``EGOTASKQA_VIDEO_DIR`` wins when it is set to a non-empty value;
+    otherwise the default ``~/.cache/lmms_eval/egotaskqa/videos`` is used.
+    A leading ``~`` in the override is expanded, because values that come
+    from config or ``.env`` files never pass through shell tilde expansion.
+    """
+    override = os.environ.get("EGOTASKQA_VIDEO_DIR")
+    if override:
+        return os.path.expanduser(override)
+    return str(Path.home() / ".cache" / "lmms_eval" / "egotaskqa" / "videos")
+
+
+def egotaskqa_doc_to_visual(doc):
+    """Resolve the video for ``doc`` to a path inside the video directory.
+
+    Args:
+        doc: Dataset record; ``doc["video_path"]`` is joined onto the
+            directory returned by ``_video_dir()``.
+
+    Returns:
+        A one-element list containing the resolved path. The path is returned
+        even when the file does not exist; a warning is logged in that case.
+    """
+    video_dir = _video_dir()
+    video_path = os.path.join(video_dir, doc["video_path"])
+    if not os.path.exists(video_path):
+        # Name the directory that was actually searched (it may be an
+        # override), and ask for the .mp4 files themselves: video_path is
+        # joined straight onto this directory, with no qa_videos/ level.
+        eval_logger.warning(
+            f"Video not found: {video_path}. Set EGOTASKQA_VIDEO_DIR to the "
+            f"directory containing the qa_videos/*.mp4 files, or place them "
+            f"directly in {video_dir}."
+        )
+    return [video_path]
+
+
+def egotaskqa_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the multiple-choice prompt for one document.
+
+    Args:
+        doc: Dataset record; ``doc["q"]`` is the question and
+            ``doc["option"]`` maps option letters to option text.
+        lmms_eval_specific_kwargs: Optional mapping that may carry
+            ``pre_prompt`` and ``post_prompt``; each defaults to ``""``.
+
+    Returns:
+        ``pre_prompt``, a fixed instruction with the question, one
+        ``(letter) text`` line per option in sorted letter order, and
+        ``post_prompt``, concatenated in that order.
+    """
+    kwargs = {} if lmms_eval_specific_kwargs is None else lmms_eval_specific_kwargs
+    prefix = kwargs.get("pre_prompt", "")
+    suffix = kwargs.get("post_prompt", "")
+
+    header = (
+        "Select the best answer to the following multiple-choice question "
+        f"based on the video.\n{doc['q']}\nOptions:\n"
+    )
+    choices = doc["option"]
+    option_block = "".join(
+        f"({key}) {choices[key]}\n" for key in sorted(choices.keys())
+    )
+
+    return f"{prefix}{header}{option_block}{suffix}"
+
+
+def _extract_answer(response, options):
+    """Extract the predicted option letter from a model response.
+
+    The first standalone occurrence of a valid option letter wins. If no
+    letter is found, the response is searched (case-insensitively) for the
+    text of each option, in sorted letter order.
+
+    Args:
+        response: Raw text generated by the model.
+        options: Mapping from option letter to option text.
+
+    Returns:
+        The matched option letter, or ``""`` when nothing matches.
+    """
+    letters = sorted(options.keys())
+    if not letters:
+        return ""
+
+    # Drop the literal word so glued forms such as "AnswerB" still parse.
+    # Only the letter search sees this stripped text; the option-text
+    # fallback below uses the untouched response.
+    stripped = response.replace("answer", "").replace("Answer", "")
+
+    # Match only the letters this document offers, and only when they stand
+    # alone: without the lookarounds the "B" of "Based on ..." would be read
+    # as an answer. NOTE: a leading article "A" is still taken as option A.
+    alternatives = "|".join(re.escape(letter) for letter in letters)
+    standalone = re.search(
+        rf"(?<![A-Za-z])({alternatives})(?![A-Za-z])", stripped
+    )
+    if standalone:
+        return standalone.group(1)
+
+    lowered = response.lower()
+    for letter in letters:
+        opt = options[letter]
+        if not isinstance(opt, str):
+            continue
+        opt = opt.strip().strip(".")
+        # An empty option text is a substring of every response; skip it.
+        if opt and opt.lower() in lowered:
+            return letter
+
+    return ""
+
+
+def egotaskqa_process_results(doc, results):
+    """Turn one model generation into the per-document accuracy record.
+
+    Args:
+        doc: Dataset record; ``doc["option"]`` maps option letters to text
+            and ``doc["a"]`` is the ground-truth answer.
+        results: Model outputs for this document; only ``results[0]`` is used.
+
+    Returns:
+        ``{"egotaskqa_accuracy": {"pred_answer": ..., "ground_truth": ...}}``.
+    """
+    pred = results[0]
+    pred_ans = _extract_answer(pred, doc["option"])
+    return {
+        "egotaskqa_accuracy": {
+            "pred_answer": pred_ans,
+            "ground_truth": doc["a"],
+        }
+    }
+
+
+def egotaskqa_aggregate_results(results):
+    """Return the fraction of records whose prediction equals the ground truth.
+
+    Args:
+        results: Per-document records, each with ``pred_answer`` and
+            ``ground_truth`` keys.
+
+    Returns:
+        Matching records divided by total records, or ``0`` when ``results``
+        is empty.
+    """
+    if not results:
+        return 0
+    hits = [entry["pred_answer"] == entry["ground_truth"] for entry in results]
+    return sum(hits) / len(results)