|
5 | 5 | 1. Each sample carries a first-frame image (base64 PNG) + a text prompt. |
6 | 6 | 2. The model generates an MP4 and returns JSON: ``{"text": "", "videos": [path]}``. |
7 | 7 | 3. ``vbvr_process_results`` parses the JSON, resolves the matching ground-truth |
8 | | - folder under ``$VBVR_GT_PATH``, dispatches to the per-task rule-based |
9 | | - evaluator from the vendored ``vbvr_bench`` package, and records a per-sample |
10 | | - score + dimension breakdown. |
| 8 | + folder from the configured cache or ``$VBVR_GT_PATH``, dispatches to the |
| 9 | + per-task rule-based evaluator from the vendored ``vbvr_bench`` package, and |
| 10 | + records a per-sample score + dimension breakdown. |
11 | 11 | 4. Aggregation functions compute In-Domain / Out-of-Domain / per-category means |
12 | 12 | and an overall mean matching the upstream VBVRBench output. |
13 | 13 |
|
14 | 14 | Environment variables |
15 | 15 | --------------------- |
16 | | -- ``VBVR_GT_PATH``: local root of the downloaded Video-Reason/VBVR-Bench-Data |
17 | | - dataset. Must contain ``In-Domain_50/`` and ``Out-of-Domain_50/`` folders with |
18 | | - ``{task_name}/{video_idx}/{first_frame.png,final_frame.png,ground_truth.mp4,prompt.txt}``. |
| 16 | +- ``VBVR_GT_PATH``: optional local root of the downloaded |
| 17 | + Video-Reason/VBVR-Bench-Data dataset. If unset, the task uses the configured |
| 18 | + lmms-eval/Hugging Face cache directory and falls back to ``snapshot_download``. |
19 | 19 | """ |
20 | 20 |
|
21 | 21 | from __future__ import annotations |
|
26 | 26 | import os |
27 | 27 | import re |
28 | 28 | from collections import defaultdict |
| 29 | +from functools import lru_cache |
| 30 | +from pathlib import Path |
29 | 31 | from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple |
30 | 32 |
|
31 | 33 | import numpy as np |
| 34 | +import yaml |
| 35 | +from huggingface_hub import snapshot_download |
32 | 36 | from loguru import logger as eval_logger |
33 | 37 | from PIL import Image |
34 | 38 |
|
| 39 | +from lmms_eval import utils as lmms_utils |
| 40 | +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file |
35 | 41 | from lmms_eval.tasks.vbvr.vbvr_bench.evaluators import ( |
36 | 42 | get_evaluator, |
37 | 43 | get_split, |
|
44 | 50 | _BASE64_PREFIX = re.compile(r"^data:image/[^;]+;base64,", re.IGNORECASE) |
45 | 51 |
|
46 | 52 |
|
@lru_cache(maxsize=1)
def _task_config() -> Dict[str, Any]:
    """Load the task's ``_default_template_yaml`` as a plain dictionary.

    The template file is read from the directory containing this module.
    Every line that contains ``!function`` is dropped before parsing
    (presumably because ``yaml.safe_load`` cannot resolve that custom tag --
    confirm against the template). The parsed result is memoized, so the file
    is read at most once per process.

    Returns:
        The parsed mapping, or an empty dict when the parsed document is falsy.
    """
    template_path = Path(__file__).parent / "_default_template_yaml"
    kept_lines = []
    with open(template_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if "!function" in raw_line:
                continue
            kept_lines.append(raw_line)
    parsed = yaml.safe_load("".join(kept_lines))
    return parsed or {}
| 58 | + |
| 59 | + |
def _dataset_repo_id() -> str:
    """Return the ``dataset_path`` entry of the task config as a string.

    Raises:
        KeyError: If the task config has no ``dataset_path`` key.
    """
    config = _task_config()
    repo_id = config["dataset_path"]
    return str(repo_id)
| 62 | + |
| 63 | + |
def _cache_dir_name() -> str:
    """Return ``dataset_kwargs.cache_dir`` from the task config as a string.

    Raises:
        KeyError: If ``dataset_kwargs`` or its ``cache_dir`` entry is missing.
    """
    dataset_kwargs = _task_config()["dataset_kwargs"]
    cache_dir = dataset_kwargs["cache_dir"]
    return str(cache_dir)
| 66 | + |
| 67 | + |
def _looks_like_vbvr_root(root: str) -> bool:
    """Tell whether ``root`` has the two expected split folders.

    Args:
        root: Directory path to inspect.

    Returns:
        ``True`` only when both ``In-Domain_50`` and ``Out-of-Domain_50`` exist
        as directories directly under ``root``.
    """
    for split_dir in ("In-Domain_50", "Out-of-Domain_50"):
        candidate = os.path.join(root, split_dir)
        if not os.path.isdir(candidate):
            # A single missing split folder is enough to reject the root.
            return False
    return True
| 70 | + |
| 71 | + |
@lru_cache(maxsize=1)
def _gt_root() -> str:
    """Locate the root directory holding the VBVR ground-truth data.

    Candidates are tried in this order:

    1. ``$VBVR_GT_PATH`` (with ``~`` and environment variables expanded). When
       it is set it must pass ``_looks_like_vbvr_root``; no fallback is tried.
    2. The directory returned by ``lmms_utils.resolve_cache_dir`` for the
       configured cache-dir name, based at ``$HF_HOME`` (default
       ``~/.cache/huggingface``).
    3. The path returned by ``snapshot_download`` for the configured dataset
       repo.

    A successful result is memoized for the rest of the process.

    Returns:
        The first candidate that passes ``_looks_like_vbvr_root``.

    Raises:
        RuntimeError: If ``VBVR_GT_PATH`` is set but does not look like a VBVR
            root, or if neither the cache directory nor the snapshot does.
    """
    env_root = os.getenv("VBVR_GT_PATH")
    if env_root:
        expanded = os.path.expanduser(os.path.expandvars(env_root))
        if not _looks_like_vbvr_root(expanded):
            raise RuntimeError(f"VBVR_GT_PATH does not look like a VBVR-Bench checkout: {expanded}")
        return expanded

    hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
    cache_root = lmms_utils.resolve_cache_dir(_cache_dir_name(), base_dir=hf_home)
    if _looks_like_vbvr_root(cache_root):
        return cache_root

    # Last resort: ask the Hugging Face hub client for a snapshot of the dataset repo.
    snapshot_root = snapshot_download(repo_id=_dataset_repo_id(), repo_type="dataset")
    if not _looks_like_vbvr_root(snapshot_root):
        raise RuntimeError(f"Could not locate VBVR GT files in {cache_root} or HF snapshot {snapshot_root}.")
    return snapshot_root
52 | 90 |
|
53 | 91 |
|
54 | 92 | def _decode_base64_image(data: str) -> Image.Image: |
@@ -169,6 +207,7 @@ def _fanout_metrics(entry: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: |
169 | 207 | "vbvr_perception": entry, |
170 | 208 | "vbvr_spatiality": entry, |
171 | 209 | "vbvr_transformation": entry, |
| 210 | + "submission": entry, |
172 | 211 | } |
173 | 212 |
|
174 | 213 |
|
@@ -237,6 +276,42 @@ def _agg_by(results, key: str, value: str, label: str) -> float: |
237 | 276 | return mean |
238 | 277 |
|
239 | 278 |
|
def _summary(entries: Sequence[Dict[str, Any]]) -> Dict[str, Any]:
    """Build the summary section of the submission payload.

    Only entries whose ``score`` is an ``int`` or ``float`` contribute.

    Args:
        entries: Per-sample result dictionaries.

    Returns:
        A dict with ``overall`` (mean score) and ``n`` (scored-sample count),
        plus one ``{"score": mean, "n": count}`` record per name in ``SPLITS``
        (keyed by the split name) and per name in ``CATEGORIES`` (keyed by the
        lower-cased category name).
    """
    # Filter once; every bucket below draws from the same scored subset.
    scored = [entry for entry in entries if isinstance(entry.get("score"), (int, float))]

    def _bucket(field: str, wanted: Any) -> Dict[str, Any]:
        values = [float(entry["score"]) for entry in scored if entry.get(field) == wanted]
        return {"score": _mean(values), "n": len(values)}

    all_scores = [float(entry["score"]) for entry in scored]
    report: Dict[str, Any] = {"overall": _mean(all_scores), "n": len(all_scores)}
    for split_name in SPLITS:
        report[split_name] = _bucket("split", split_name)
    for category_name in CATEGORIES:
        report[category_name.lower()] = _bucket("category", category_name)
    return report
| 292 | + |
| 293 | + |
def _submission_file_name(entries: Sequence[Dict[str, Any]]) -> str:
    """Choose the output file name from the splits present in ``entries``.

    Args:
        entries: Per-sample result dictionaries; falsy or missing ``split``
            values are ignored.

    Returns:
        A split-specific file name when every non-empty ``split`` value is
        ``"In_Domain"`` (or every one is ``"Out_of_Domain"``); otherwise the
        generic ``vbvr_eval_results.json``.
    """
    present = frozenset(filter(None, (entry.get("split") for entry in entries)))
    names_by_splits = {
        frozenset({"In_Domain"}): "vbvr_in_domain_eval_results.json",
        frozenset({"Out_of_Domain"}): "vbvr_out_of_domain_eval_results.json",
    }
    return names_by_splits.get(present, "vbvr_eval_results.json")
| 301 | + |
| 302 | + |
def vbvr_aggregate_submission(results, args) -> None:
    """Write the detailed per-sample results and their summary to a JSON file.

    Entries obtained from ``_entries(results)`` are ordered by
    ``(file_split, task_name, video_idx)``, each compared as a string with a
    missing key treated as ``""``. The output path comes from
    ``generate_submission_file`` and the payload has two keys: ``summary`` and
    ``results``.

    Args:
        results: Raw aggregation input, passed to ``_entries``.
        args: Passed through to ``generate_submission_file``.
    """

    def _order_key(entry):
        return (
            str(entry.get("file_split", "")),
            str(entry.get("task_name", "")),
            str(entry.get("video_idx", "")),
        )

    ordered = sorted(_entries(results), key=_order_key)
    file_name = _submission_file_name(ordered)
    path = generate_submission_file(file_name, args)
    payload = {"summary": _summary(ordered), "results": ordered}
    with open(path, "w", encoding="utf-8") as out:
        json.dump(payload, out, indent=2)
    eval_logger.info(f"[VBVR] Detailed evaluation results saved to {path}")
| 313 | + |
| 314 | + |
240 | 315 | def vbvr_aggregate_overall(results) -> float: |
241 | 316 | entries = _entries(results) |
242 | 317 | if not entries: |
|
0 commit comments