snapshot download

pufanyi · pufanyi · commit cc5267e604e3 · 2026-04-23T13:54:56.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -83,3 +83,4 @@ CLAUDE.md
 .worktrees/
 Bagel/
 MMaDA/
+.codex
diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
@@ -1036,8 +1036,11 @@ def _download_from_youtube(path):
                     force_unzip = dataset_kwargs.get("force_unzip", False)
                     revision = dataset_kwargs.get("revision", "main")
                     create_link = dataset_kwargs.get("create_link", False)
-                    # If the user already has a cache dir, we skip download the zip files
-                    if not os.path.exists(cache_dir):
+                    cache_path = None
+                    # If the user already has a cache dir, we skip downloading archives.
+                    # Tasks that set create_link need the snapshot path even when the
+                    # cache dir already exists as a symlink from a previous run.
+                    if not os.path.exists(cache_dir) or (create_link and os.path.islink(cache_dir)):
                         cache_path = snapshot_download(repo_id=self.DATASET_PATH, revision=revision, repo_type="dataset", force_download=force_download, etag_timeout=60)
                         zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
                         tar_files = glob(os.path.join(cache_path, "**/*.tar*"), recursive=True)
@@ -1106,7 +1109,7 @@ def concat_tar_parts(tar_parts, output_tar):
                                 untar_video_data(output_tar)
 
                     # Link cache_path to cache_dir if needed.
-                    if create_link:
+                    if create_link and cache_path is not None:
                         if not os.path.exists(cache_dir) or os.path.islink(cache_dir):
                             if os.path.islink(cache_dir):
                                 os.remove(cache_dir)
diff --git a/lmms_eval/models/chat/fastvideo.py b/lmms_eval/models/chat/fastvideo.py
@@ -70,6 +70,11 @@ def _safe(name: str, default: str = "x") -> str:
     return s[:128]
 
 
+def _default_output_dir() -> str:
+    """Return the fallback directory for FastVideo-generated videos.
+
+    The path is ``<HF_HOME>/lmms_eval/generated_videos/fastvideo``. ``HF_HOME``
+    is read from the environment and defaults to ``~/.cache/huggingface``;
+    a leading ``~`` is expanded.
+    """
+    hf_root = os.getenv("HF_HOME", "~/.cache/huggingface")
+    subdirs = ("lmms_eval", "generated_videos", "fastvideo")
+    return os.path.join(os.path.expanduser(hf_root), *subdirs)
+
+
 _DTYPES = {
     "float32": torch.float32,
     "fp32": torch.float32,
@@ -182,9 +187,10 @@ def __init__(
         vae_cpu_offload: bool = True,
         # Misc
         trust_remote_code: bool = True,
-        output_dir: str = "./fastvideo_generated_videos",
+        output_dir: Optional[str] = None,
         batch_size: int = 1,
-        # Resume support: skip samples whose output mp4 already exists.
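+        # Artifact reuse: lmms-eval's response cache stores the JSON response,
+        # while VBVR still needs the referenced mp4 on disk. With
+        # overwrite=False an existing non-empty mp4 is reused; overwrite=True
+        # forces regeneration.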
         overwrite: bool = False,
         **kwargs,
     ):
@@ -206,7 +212,7 @@ def __init__(
         self.seed = seed
         self.negative_prompt = negative_prompt
 
-        self.output_dir = os.path.abspath(os.path.expanduser(output_dir))
+        self.output_dir = os.path.abspath(os.path.expanduser(output_dir or _default_output_dir()))
         os.makedirs(self.output_dir, exist_ok=True)
         self._tmp_img_dir = tempfile.mkdtemp(prefix="fastvideo_inputs_")
 
@@ -522,8 +528,8 @@ def generate_until(self, requests: List[Instance]) -> List[GenerationResult]:
         with ThreadPoolExecutor(max_workers=WORKERS) as executor:
             prepared = list(executor.map(self.make_one_request, requests))
 
-        # Resume: if the target mp4 already exists and is non-empty, reuse it.
-        # Set overwrite=True in model_args to force regeneration.
+        # Reuse generated artifacts when the target mp4 already exists and is
+        # non-empty. Set overwrite=True in model_args to force regeneration.
         presults: List[Optional[GenerationResult]] = [None] * len(prepared)
         skipped_indices: List[int] = []
         if not self.overwrite:
diff --git a/lmms_eval/tasks/vbvr/README.md b/lmms_eval/tasks/vbvr/README.md
@@ -17,18 +17,18 @@ MP4 video; scoring is rule-based and per-task (no LLM judge, no CLIP).
 | `vbvr_in_domain`      | In-Domain_50 only                |
 | `vbvr_out_of_domain`  | Out-of-Domain_50 only            |
 
-## One-time setup
+## Data Cache
 
 The HF dataset card (`Video-Reason/VBVR-Bench-Data`) carries the base64-encoded
 first-frame plus **relative** paths to `ground_truth.mp4`, `first_frame.png`,
-`final_frame.png`, `prompt.txt` etc. The rule-based evaluators read those GT
-files, so you must first download the repo and point `VBVR_GT_PATH` at it:
+`final_frame.png`, `prompt.txt`, etc. The task config sets
+`dataset_kwargs.cache_dir: vbvr` and `create_link: True`, so lmms-eval downloads
+the dataset snapshot and links it under `$HF_HOME/vbvr` by default (`HF_HOME`
+falls back to `~/.cache/huggingface` when unset). The rule-based evaluators
+resolve ground-truth (GT) files from that cache path automatically.
 
-```bash
-hf download Video-Reason/VBVR-Bench-Data \
-  --repo-type dataset \
-  --local-dir /data/VBVR-Bench
+If you already have a local checkout (a directory containing `In-Domain_50/` and `Out-of-Domain_50/`), you can still override the GT root with:
 
+```bash
 export VBVR_GT_PATH=/data/VBVR-Bench
 ```
 
@@ -58,16 +58,13 @@ The model must output JSON of the form:
 cd /path/to/lmms-eval; or exit 1
 
 # Rule-based VBVR scorers read the GT mp4s/pngs from this root.
-set -gx VBVR_GT_PATH /path/to/VBVR-Bench
+# By default this is populated automatically at $HF_HOME/vbvr.
+# Uncomment this only if you want to use an existing local checkout.
+# set -gx VBVR_GT_PATH /path/to/VBVR-Bench
 
 set MODEL_DIR   /path/to/Wan2.2-I2V-A14B-Diffusers
-set OUT_ROOT    /path/to/eval_out/vbvr_wan22_full_highres
-set VIDEOS_DIR  $OUT_ROOT/videos
-set METRICS_DIR $OUT_ROOT/metrics
-mkdir -p $VIDEOS_DIR $METRICS_DIR
 
 set MODEL_ARGS "model=$MODEL_DIR"
-set MODEL_ARGS "$MODEL_ARGS,output_dir=$VIDEOS_DIR"
 set MODEL_ARGS "$MODEL_ARGS,data_parallel=4,num_gpus=2,sp_size=2,tp_size=1"
 set MODEL_ARGS "$MODEL_ARGS,num_inference_steps=50,num_frames=81"
 set MODEL_ARGS "$MODEL_ARGS,height=1024,width=1024,fps=16"
@@ -81,12 +78,16 @@ exec stdbuf -oL -eL .venv/bin/python -m lmms_eval eval \
     --tasks vbvr \
     --batch_size 1 \
     --log_samples \
-    --output_path $METRICS_DIR
+    --output_path logs
 ```
 
-Generated videos land in `$VIDEOS_DIR`; per-sample logs and aggregated metrics
-land in `$METRICS_DIR`. Tune `data_parallel`, `num_gpus`, `sp_size`, and the
-`*_cpu_offload` flags to match your hardware.
+Generated videos land in `$HF_HOME/lmms_eval/generated_videos/fastvideo` by
+default; pass `output_dir=<path>` in the model args to write them elsewhere.
+Per-sample logs and aggregated metrics land under `--output_path`, and the
+detailed VBVR evaluation JSON is written through `generate_submission_file()`
+under `<output_path>/submissions/`. Add `--use_cache <path>` only if you want
+lmms-eval response caching in addition to FastVideo's generated-mp4 reuse. Tune
+`data_parallel`, `num_gpus`, `sp_size`, and the `*_cpu_offload` flags to match
+your hardware.
 
 ## Metrics
 
diff --git a/lmms_eval/tasks/vbvr/_default_template_yaml b/lmms_eval/tasks/vbvr/_default_template_yaml
@@ -1,4 +1,8 @@
 dataset_path: Video-Reason/VBVR-Bench-Data
+dataset_kwargs:
+  cache_dir: vbvr
+  video: True
+  create_link: True
 test_split: test
 output_type: generate_until
 
diff --git a/lmms_eval/tasks/vbvr/utils.py b/lmms_eval/tasks/vbvr/utils.py
@@ -5,17 +5,17 @@
 1. Each sample carries a first-frame image (base64 PNG) + a text prompt.
 2. The model generates an MP4 and returns JSON: ``{"text": "", "videos": [path]}``.
 3. ``vbvr_process_results`` parses the JSON, resolves the matching ground-truth
-   folder under ``$VBVR_GT_PATH``, dispatches to the per-task rule-based
-   evaluator from the vendored ``vbvr_bench`` package, and records a per-sample
-   score + dimension breakdown.
+   folder from the configured cache or ``$VBVR_GT_PATH``, dispatches to the
+   per-task rule-based evaluator from the vendored ``vbvr_bench`` package, and
+   records a per-sample score + dimension breakdown.
 4. Aggregation functions compute In-Domain / Out-of-Domain / per-category means
    and an overall mean matching the upstream VBVRBench output.
 
 Environment variables
 ---------------------
-- ``VBVR_GT_PATH``: local root of the downloaded Video-Reason/VBVR-Bench-Data
-  dataset. Must contain ``In-Domain_50/`` and ``Out-of-Domain_50/`` folders with
-  ``{task_name}/{video_idx}/{first_frame.png,final_frame.png,ground_truth.mp4,prompt.txt}``.
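+- ``VBVR_GT_PATH``: optional local root of the downloaded
+  Video-Reason/VBVR-Bench-Data dataset. When set, it must contain the
+  ``In-Domain_50/`` and ``Out-of-Domain_50/`` folders, otherwise a
+  ``RuntimeError`` is raised. If unset, the task uses the configured
+  lmms-eval/Hugging Face cache directory and falls back to ``snapshot_download``.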
 """
 
 from __future__ import annotations
@@ -26,12 +26,18 @@
 import os
 import re
 from collections import defaultdict
+from functools import lru_cache
+from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
 
 import numpy as np
+import yaml
+from huggingface_hub import snapshot_download
 from loguru import logger as eval_logger
 from PIL import Image
 
+from lmms_eval import utils as lmms_utils
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 from lmms_eval.tasks.vbvr.vbvr_bench.evaluators import (
     get_evaluator,
     get_split,
@@ -44,11 +50,43 @@
 _BASE64_PREFIX = re.compile(r"^data:image/[^;]+;base64,", re.IGNORECASE)
 
 
+@lru_cache(maxsize=1)
+def _task_config() -> Dict[str, Any]:
+    """Load the ``_default_template_yaml`` file that sits next to this module.
+
+    Lines containing ``!function`` are dropped before parsing (presumably
+    because ``yaml.safe_load`` cannot construct that custom tag). A falsy parse
+    result, such as an empty file, is replaced by ``{}``. The result is
+    memoized, so the file is read at most once per process.
+    """
+    template_path = Path(__file__).parent / "_default_template_yaml"
+    with template_path.open("r", encoding="utf-8") as template:
+        yaml_text = "".join(row for row in template if "!function" not in row)
+    return yaml.safe_load(yaml_text) or {}
+
+
+def _dataset_repo_id() -> str:
+    """Return the ``dataset_path`` entry of the task template as a string."""
+    config = _task_config()
+    return str(config["dataset_path"])
+
+
+def _cache_dir_name() -> str:
+    """Return ``dataset_kwargs.cache_dir`` from the task template as a string."""
+    dataset_kwargs = _task_config()["dataset_kwargs"]
+    return str(dataset_kwargs["cache_dir"])
+
+
+def _looks_like_vbvr_root(root: str) -> bool:
+    """Return True when *root* contains both VBVR split directories."""
+    for split_dir in ("In-Domain_50", "Out-of-Domain_50"):
+        if not os.path.isdir(os.path.join(root, split_dir)):
+            return False
+    return True
+
+
+@lru_cache(maxsize=1)
 def _gt_root() -> str:
+    """Locate the directory holding the VBVR ground-truth files.
+
+    Lookup order:
+
+    1. ``$VBVR_GT_PATH`` when set (``~`` and environment variables expanded).
+       It must pass ``_looks_like_vbvr_root``; otherwise ``RuntimeError``.
+    2. The cache directory named in the task template, resolved against
+       ``HF_HOME`` (default ``~/.cache/huggingface``).
+    3. A Hugging Face ``snapshot_download`` of the dataset repository.
+
+    Raises ``RuntimeError`` when none of these looks like a VBVR root. A
+    successful result is memoized for the lifetime of the process.
+    """
     root = os.getenv("VBVR_GT_PATH")
-    if not root:
-        raise RuntimeError("VBVR_GT_PATH is not set. Download the GT with:\n" "    hf download Video-Reason/VBVR-Bench-Data --repo-type dataset --local-dir <path>\n" "then `export VBVR_GT_PATH=<path>`.")
-    return os.path.expanduser(os.path.expandvars(root))
+    if root:
+        # An explicit override never falls through to the cache lookups.
+        root = os.path.expanduser(os.path.expandvars(root))
+        if not _looks_like_vbvr_root(root):
+            raise RuntimeError(f"VBVR_GT_PATH does not look like a VBVR-Bench checkout: {root}")
+        return root
+
+    hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
+    cache_root = lmms_utils.resolve_cache_dir(_cache_dir_name(), base_dir=hf_home)
+    if _looks_like_vbvr_root(cache_root):
+        return cache_root
+
+    # Last resort: ask the Hub for the snapshot directly.
+    snapshot_root = snapshot_download(repo_id=_dataset_repo_id(), repo_type="dataset")
+    if not _looks_like_vbvr_root(snapshot_root):
+        raise RuntimeError(f"Could not locate VBVR GT files in {cache_root} or HF snapshot {snapshot_root}.")
+    return snapshot_root
 
 
 def _decode_base64_image(data: str) -> Image.Image:
@@ -169,6 +207,7 @@ def _fanout_metrics(entry: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
         "vbvr_perception": entry,
         "vbvr_spatiality": entry,
         "vbvr_transformation": entry,
+        "submission": entry,
     }
 
 
@@ -237,6 +276,42 @@ def _agg_by(results, key: str, value: str, label: str) -> float:
     return mean
 
 
+def _summary(entries: Sequence[Dict[str, Any]]) -> Dict[str, Any]:
+    """Build the ``summary`` section of the detailed results file.
+
+    Only entries whose ``score`` is an ``int`` or ``float`` are counted. The
+    result holds the overall mean and count, one ``{"score", "n"}`` record per
+    name in ``SPLITS`` (keyed by the split name) and one per name in
+    ``CATEGORIES`` (keyed by the lower-cased category name).
+    """
+    scored = [e for e in entries if isinstance(e.get("score"), (int, float))]
+
+    def _bucket(field: str, wanted: str) -> Dict[str, Any]:
+        # Mean and count of the scored entries whose *field* equals *wanted*.
+        picked = [float(e["score"]) for e in scored if e.get(field) == wanted]
+        return {"score": _mean(picked), "n": len(picked)}
+
+    overall = [float(e["score"]) for e in scored]
+    summary: Dict[str, Any] = {"overall": _mean(overall), "n": len(overall)}
+    for split in SPLITS:
+        summary[split] = _bucket("split", split)
+    for category in CATEGORIES:
+        summary[category.lower()] = _bucket("category", category)
+    return summary
+
+
+def _submission_file_name(entries: Sequence[Dict[str, Any]]) -> str:
+    """Choose the results file name from the splits present in *entries*.
+
+    Entries with a missing or falsy ``split`` are ignored. When every remaining
+    entry belongs to ``In_Domain`` (or to ``Out_of_Domain``) the split-specific
+    name is returned; any other combination, including no splits at all,
+    yields the generic name.
+    """
+    generic_name = "vbvr_eval_results.json"
+    single_split_names = {
+        "In_Domain": "vbvr_in_domain_eval_results.json",
+        "Out_of_Domain": "vbvr_out_of_domain_eval_results.json",
+    }
+    seen = {entry.get("split") for entry in entries if entry.get("split")}
+    if len(seen) != 1:
+        return generic_name
+    (only_split,) = seen
+    return single_split_names.get(only_split, generic_name)
+
+
+def vbvr_aggregate_submission(results, args) -> None:
+    """Write the detailed per-sample VBVR results to a submission JSON file.
+
+    Entries are ordered by ``file_split``, ``task_name`` and ``video_idx``
+    (each compared as a string, missing values as ``""``). They are written
+    together with their summary to the path that ``generate_submission_file``
+    returns for *args*. Nothing is returned.
+    """
+
+    def _order(entry):
+        # Stringify so mixed or missing field types still sort consistently.
+        return tuple(str(entry.get(field, "")) for field in ("file_split", "task_name", "video_idx"))
+
+    entries = sorted(_entries(results), key=_order)
+    path = generate_submission_file(_submission_file_name(entries), args)
+    # Build the payload before opening the file so a summary failure cannot
+    # leave a truncated file behind.
+    payload = {"summary": _summary(entries), "results": entries}
+    with open(path, "w", encoding="utf-8") as out_file:
+        json.dump(payload, out_file, indent=2)
+    eval_logger.info(f"[VBVR] Detailed evaluation results saved to {path}")
+
+
 def vbvr_aggregate_overall(results) -> float:
     entries = _entries(results)
     if not entries:
diff --git a/lmms_eval/tasks/vbvr/vbvr.yaml b/lmms_eval/tasks/vbvr/vbvr.yaml
@@ -26,6 +26,9 @@ metric_list:
   - metric: vbvr_transformation
     aggregation: !function utils.vbvr_aggregate_transformation
     higher_is_better: true
+  - metric: submission
+    aggregation: !function utils.vbvr_aggregate_submission
+    higher_is_better: true
 
 metadata:
   - version: 0.1
diff --git a/lmms_eval/tasks/vbvr/vbvr_in_domain.yaml b/lmms_eval/tasks/vbvr/vbvr_in_domain.yaml
@@ -22,6 +22,9 @@ metric_list:
   - metric: vbvr_transformation
     aggregation: !function utils.vbvr_aggregate_transformation
     higher_is_better: true
+  - metric: submission
+    aggregation: !function utils.vbvr_aggregate_submission
+    higher_is_better: true
 
 metadata:
   - version: 0.1
diff --git a/lmms_eval/tasks/vbvr/vbvr_out_of_domain.yaml b/lmms_eval/tasks/vbvr/vbvr_out_of_domain.yaml
@@ -22,6 +22,9 @@ metric_list:
   - metric: vbvr_transformation
     aggregation: !function utils.vbvr_aggregate_transformation
     higher_is_better: true
+  - metric: submission
+    aggregation: !function utils.vbvr_aggregate_submission
+    higher_is_better: true
 
 metadata:
   - version: 0.1

-Original file line number
+Diff line change
 .worktrees/
 Bagel/
 MMaDA/
 +.codex