Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/current_tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ python -m lmms_eval --tasks list_with_num
- [SNS-Bench](https://github.com/SNS-Bench/SNS-Bench) (snsbench)
- [TOMATO](https://github.com/TOMATO-Lab/TOMATO) (tomato)
- [VMC-Bench](https://github.com/VMC-Bench/VMC-Bench) (vmcbench)
- [ViVerBench](https://huggingface.co/datasets/comin/ViVerBench) (viverbench)
- [Visual Puzzles](https://github.com/VisualPuzzles/VisualPuzzles) (VisualPuzzles)
- [VisualWebBench](https://visualwebbench.github.io/) (visualwebbench)
- [V*-Bench](https://github.com/V-Bench/V-Bench) (vstar_bench)
Expand Down
22 changes: 22 additions & 0 deletions lmms_eval/tasks/viverbench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# ViVerBench

ViVerBench evaluates whether multimodal models can verify if generated visual outputs satisfy prompt-level constraints.

- Paper: [Generative Universal Verifier as Multimodal Meta-Reasoner](https://huggingface.co/papers/2510.13804)
- Dataset: [comin/ViVerBench](https://huggingface.co/datasets/comin/ViVerBench)

## Overview

- 3,594 examples across 16 task categories
- Binary verification target (`true` / `false`)
- Each example includes 1, 2, or 8 input images

## Usage

```bash
python -m lmms_eval \
--model <model_name> \
--tasks viverbench \
--batch_size 1 \
--limit 8
```
27 changes: 27 additions & 0 deletions lmms_eval/tasks/viverbench/_default_template_yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Shared settings for the ViVerBench task.
dataset_path: comin/ViVerBench

output_type: generate_until
# Visual/prompt builders are Python callables resolved from the `utils` module.
doc_to_visual: !function utils.viverbench_doc_to_visual
doc_to_text: !function utils.viverbench_doc_to_text
doc_to_target: "answer"

# Deterministic decoding: sampling is off, a single beam, temperature 0.
generation_kwargs:
max_new_tokens: 256
temperature: 0
top_p: 1.0
num_beams: 1
do_sample: false

# Per-sample scoring and the final aggregation are both custom functions in `utils`.
process_results: !function utils.viverbench_process_results
metric_list:
- metric: viverbench_acc
aggregation: !function utils.viverbench_aggregate_results
higher_is_better: true

# Both prompt affixes are empty by default, so only the dataset question is sent.
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""

metadata:
- version: 0.0
145 changes: 145 additions & 0 deletions lmms_eval/tasks/viverbench/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import ast
import json
import re
from collections import defaultdict
from io import BytesIO
from typing import Any, Optional

from loguru import logger as eval_logger
from PIL import Image


def _coerce_bool(value: Any) -> Optional[bool]:
if isinstance(value, bool):
return value

if isinstance(value, str):
normalized = value.strip().lower()
if normalized in {"true", "yes", "y", "1"}:
return True
if normalized in {"false", "no", "n", "0"}:
return False

return None


def _parse_bool_from_serialized(candidate: str) -> Optional[bool]:
    """Decode ``candidate`` as JSON or a Python literal and extract a verdict.

    The text is tried with ``json.loads`` first and ``ast.literal_eval``
    second. A decoded mapping is searched for an ``answer`` / ``Answer`` /
    ``ANSWER`` key; any other decoded value is coerced directly.

    Args:
        candidate: Text that may hold a serialized answer.

    Returns:
        The boolean verdict, or ``None`` if nothing usable was found.
    """
    payload = candidate.strip()
    if payload == "":
        return None

    answer_keys = ("answer", "Answer", "ANSWER")

    for decode in (json.loads, ast.literal_eval):
        try:
            value = decode(payload)
        except Exception:
            # This decoder cannot read the text; let the next one try.
            continue

        if not isinstance(value, dict):
            verdict = _coerce_bool(value)
            if verdict is not None:
                return verdict
            continue

        # First matching key wins, even if its value turns out not to be
        # coercible (in which case the result is None).
        matched_key = next((key for key in answer_keys if key in value), None)
        if matched_key is not None:
            return _coerce_bool(value[matched_key])

    return None


def _parse_bool_from_response(response: str) -> Optional[bool]:
    """Extract a true/false verdict from a free-form model response.

    Strategies are tried from strictest to loosest, and the first one that
    yields a verdict wins:

    1. the whole response (minus surrounding quotes/backticks/spaces) is a
       bare boolean word;
    2. the response, a fenced code block inside it, or a ``{...}`` span
       inside it decodes to a serialized answer;
    3. an ``"answer": true|false`` or ``answer: / answer = true|false``
       pattern appears in the lower-cased text;
    4. the first standalone ``true``/``false``/``yes``/``no`` word.

    Args:
        response: Raw text produced by the model.

    Returns:
        The parsed boolean, or ``None`` when no verdict can be found.
    """
    if not response:
        return None

    text = response.strip()

    # Strategy 1: bare literal.
    verdict = _coerce_bool(text.strip("`\"' "))
    if verdict is not None:
        return verdict

    # Strategy 2: structured payloads, full text first, then embedded spans.
    fenced_blocks = re.findall(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.IGNORECASE | re.DOTALL)
    braced_spans = re.findall(r"\{[\s\S]*?\}", text)
    for snippet in [text, *fenced_blocks, *braced_spans]:
        verdict = _parse_bool_from_serialized(snippet)
        if verdict is not None:
            return verdict

    folded = text.lower()

    # Strategy 3: explicit "answer" labels, quoted-key form before bare form.
    labelled_patterns = (
        r'"answer"\s*:\s*(true|false)',
        r"\banswer\s*[:=]\s*(true|false)\b",
    )
    for pattern in labelled_patterns:
        hit = re.search(pattern, folded)
        if hit:
            return hit.group(1) == "true"

    # Strategy 4: last resort, the first boolean-looking word anywhere.
    hit = re.search(r"\b(true|false|yes|no)\b", folded)
    return _coerce_bool(hit.group(1)) if hit else None


def viverbench_doc_to_visual(doc: dict[str, Any]) -> list:
    """Decode the images attached to a dataset row.

    Args:
        doc: Dataset row. Each element of its ``"img"`` field is wrapped in
            ``BytesIO`` and opened with PIL, so elements are expected to be
            encoded image bytes.

    Returns:
        The decoded images converted to RGB, in their original order. An
        empty list is returned when ``"img"`` is missing, ``None`` or empty.
    """
    encoded_images = doc.get("img")
    # A present-but-null field would otherwise reach the loop as None and
    # raise TypeError; treat it the same as a missing field.
    if encoded_images is None:
        return []

    return [Image.open(BytesIO(raw)).convert("RGB") for raw in encoded_images]


def viverbench_doc_to_text(doc: dict[str, Any], lmms_eval_specific_kwargs: Optional[dict[str, Any]] = None) -> str:
    """Build the text prompt for one dataset row.

    Args:
        doc: Dataset row; must contain a ``"question"`` entry.
        lmms_eval_specific_kwargs: Optional mapping that may supply
            ``"pre_prompt"`` and ``"post_prompt"`` strings.

    Returns:
        The non-empty parts among pre-prompt, question and post-prompt, in
        that order, joined by newlines.
    """
    options = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs is not None else {}

    ordered_pieces = (
        options.get("pre_prompt", ""),
        doc["question"],
        options.get("post_prompt", ""),
    )

    # Empty pieces are dropped so they do not produce stray blank lines.
    segments = []
    for piece in ordered_pieces:
        if piece:
            segments.append(piece)
    return "\n".join(segments)


def viverbench_process_results(doc: dict[str, Any], results: list[str]) -> dict[str, dict[str, Any]]:
    """Score a single model response against the row's ground truth.

    Args:
        doc: Dataset row. ``"answer"`` is required; ``"prompt_id"`` and
            ``"task"`` are optional and default to ``""`` and ``"unknown"``.
        results: Model outputs for this row; only the first is scored. An
            empty list is treated as an empty response.

    Returns:
        A one-entry dict keyed by ``"viverbench_acc"`` whose value is the
        per-sample record (ids, target, prediction, raw text, correctness).
    """
    response = results[0] if results else ""
    pred_answer = _parse_bool_from_response(response)

    # A plain bool() cast would turn the string label "false" into True
    # (any non-empty string is truthy), so normalise textual labels first
    # and only fall back to truthiness for values the helper does not know.
    raw_target = doc["answer"]
    target_answer = _coerce_bool(raw_target)
    if target_answer is None:
        target_answer = bool(raw_target)

    submission = {
        "prompt_id": doc.get("prompt_id", ""),
        "task": doc.get("task", "unknown"),
        "target_answer": target_answer,
        "pred_answer": pred_answer,
        "raw_response": response,
        # An unparseable response (None) is always counted as wrong.
        "is_correct": pred_answer is not None and pred_answer == target_answer,
    }

    return {"viverbench_acc": submission}


def viverbench_aggregate_results(results: list[dict[str, Any]]) -> float:
    """Compute overall accuracy and log a per-task breakdown.

    Args:
        results: Per-sample records; each may carry ``"task"`` (defaults to
            ``"unknown"``) and ``"is_correct"`` (defaults to ``False``).

    Returns:
        Fraction of records marked correct, or ``0.0`` for an empty list.
    """
    if not results:
        return 0.0

    seen_per_task = defaultdict(int)
    hits_per_task = defaultdict(int)

    for record in results:
        task_name = record.get("task", "unknown")
        seen_per_task[task_name] += 1
        if bool(record.get("is_correct", False)):
            hits_per_task[task_name] += 1

    # Report tasks in sorted order so the log layout is stable across runs.
    for task_name in sorted(seen_per_task):
        hits = hits_per_task[task_name]
        seen = seen_per_task[task_name]
        task_acc = hits / seen if seen else 0.0
        eval_logger.info(f"ViVerBench - {task_name}: {task_acc:.4f} ({hits}/{seen})")

    total_correct = sum(hits_per_task.values())
    overall_acc = total_correct / len(results)
    eval_logger.info(f"ViVerBench - overall: {overall_acc:.4f} ({total_correct}/{len(results)})")
    return overall_acc
3 changes: 3 additions & 0 deletions lmms_eval/tasks/viverbench/viverbench.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
task: "viverbench"
# Evaluation reads the split named `train`.
test_split: train
# All remaining settings come from the shared template file.
include: _default_template_yaml