Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions lmms_eval/tasks/revsi/_default_template_yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Shared ReVSI task template; the per-frame-count task files pull it in via
# `include: _default_template_yaml`.
# Dataset to load and the split used for evaluation.
dataset_path: 3dlg-hcvc/ReVSI
test_split: test
# Extra dataset-loading options. `cache_dir` is also read by utils.py to locate
# the video files. NOTE(review): `token: True` presumably enables authenticated
# Hub access - confirm against the loader.
dataset_kwargs:
token: True
cache_dir: revsi
video: True
output_type: generate_until
# Hooks implemented in this task's utils.py.
process_docs: !function utils.process_docs
doc_to_visual: !function utils.revsi_doc_to_visual
doc_to_text: !function utils.revsi_doc_to_text
doc_to_target: "ground_truth"
# Short generations with sampling disabled: answers are a single option letter
# or a single number (see the post prompts below).
generation_kwargs:
max_new_tokens: 16
temperature: 0
do_sample: false
process_results: !function utils.revsi_process_results
# One entry per reported metric; each is aggregated by its own function in
# utils.py and higher values are better.
metric_list:
- metric: overall_acc
aggregation: !function utils.revsi_aggregate_overall
higher_is_better: true
- metric: object_abs_distance_acc
aggregation: !function utils.revsi_aggregate_object_abs_distance_acc
higher_is_better: true
- metric: object_counting_acc
aggregation: !function utils.revsi_aggregate_object_counting_acc
higher_is_better: true
- metric: object_rel_direction_acc
aggregation: !function utils.revsi_aggregate_object_rel_direction_acc
higher_is_better: true
- metric: object_rel_distance_acc
aggregation: !function utils.revsi_aggregate_object_rel_distance_acc
higher_is_better: true
- metric: object_size_estimation_acc
aggregation: !function utils.revsi_aggregate_object_size_estimation_acc
higher_is_better: true
- metric: room_size_estimation_acc
aggregation: !function utils.revsi_aggregate_room_size_estimation_acc
higher_is_better: true
- metric: route_planning_acc
aggregation: !function utils.revsi_aggregate_route_planning_acc
higher_is_better: true
# Prompt fragments read by utils.revsi_doc_to_text: `pre_prompt` goes before the
# question; `mcq_post_prompt` / `nq_post_prompt` go after it for multiple-choice
# and numeric questions respectively.
lmms_eval_specific_kwargs:
default:
pre_prompt: "These are frames of a video."
mcq_post_prompt: "Answer with the option's letter from the given choices directly."
nq_post_prompt: "Answer the question using a single integer or decimal number."
metadata:
- version: 1.0
6 changes: 6 additions & 0 deletions lmms_eval/tasks/revsi/revsi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Group definition bundling the four ReVSI frame-count variants under the
# name `revsi`.
group: revsi
task:
- revsi_all_frame
- revsi_64_frame
- revsi_32_frame
- revsi_16_frame
3 changes: 3 additions & 0 deletions lmms_eval/tasks/revsi/revsi_16_frame.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# ReVSI task for the `16_frame` dataset configuration; all remaining settings
# come from the included shared template.
dataset_name: 16_frame
task: revsi_16_frame
include: _default_template_yaml
3 changes: 3 additions & 0 deletions lmms_eval/tasks/revsi/revsi_32_frame.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# ReVSI task for the `32_frame` dataset configuration; all remaining settings
# come from the included shared template.
dataset_name: 32_frame
task: revsi_32_frame
include: _default_template_yaml
3 changes: 3 additions & 0 deletions lmms_eval/tasks/revsi/revsi_64_frame.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# ReVSI task for the `64_frame` dataset configuration; all remaining settings
# come from the included shared template.
dataset_name: 64_frame
task: revsi_64_frame
include: _default_template_yaml
3 changes: 3 additions & 0 deletions lmms_eval/tasks/revsi/revsi_all_frame.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# ReVSI task for the `all_frame` dataset configuration; all remaining settings
# come from the included shared template.
dataset_name: all_frame
task: revsi_all_frame
include: _default_template_yaml
171 changes: 171 additions & 0 deletions lmms_eval/tasks/revsi/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import os
import datasets
import numpy as np
import pandas as pd
from huggingface_hub.constants import HF_HOME
from lmms_eval.utils import resolve_cache_dir
from lmms_eval.tasks._task_utils.default_template_yaml import load_default_template_yaml


# Question types treated as multiple choice: the prompt lists the document's
# options, and scoring is a case-insensitive exact match between the predicted
# answer and the ground truth.
MCQ_QUESTION_TYPES = [
    "object_rel_direction_forward_easy",
    "object_rel_direction_backward_easy",
    "object_rel_direction_forward_hard",
    "object_rel_direction_backward_hard",
    "object_rel_distance_closest",
    "object_rel_distance_farthest",
    "route_planning",
]


# Question types treated as numeric: prediction and ground truth are parsed
# as floats and scored with _mean_relative_accuracy.
NQ_QUESTION_TYPES = [
    "object_counting_single",
    "object_counting_multiple",
    "object_abs_distance",
    "object_size_estimation",
    "room_size_estimation_single",
    "room_size_estimation_multiple"
]


# Names of all reported metrics. revsi_process_results returns the scored
# document under every one of these keys, and each name has a matching
# revsi_aggregate_* function in this module.
REVSI_METRICS = [
    "overall_acc",
    "object_abs_distance_acc",
    "object_counting_acc",
    "object_rel_direction_acc",
    "object_rel_distance_acc",
    "object_size_estimation_acc",
    "room_size_estimation_acc",
    "route_planning_acc",
]


# Maps each composite metric to the fine-grained question types it covers.
# During aggregation the per-type mean accuracies are averaged (unweighted)
# into the composite metric and the per-type entries are removed.
COMPOSITE_METRICS = {
    "object_rel_direction_acc": [
        "object_rel_direction_forward_easy",
        "object_rel_direction_backward_easy",
        "object_rel_direction_forward_hard",
        "object_rel_direction_backward_hard",
    ],
    "object_rel_distance_acc": [
        "object_rel_distance_closest",
        "object_rel_distance_farthest",
    ],
    "object_counting_acc": [
        "object_counting_single",
        "object_counting_multiple",
    ],
    "room_size_estimation_acc": [
        "room_size_estimation_single",
        "room_size_estimation_multiple",
    ],
}


# Task configuration obtained from load_default_template_yaml for this file.
# NOTE(review): presumably parses the `_default_template_yaml` that sits next
# to this module - confirm against the helper.
config = load_default_template_yaml(__file__)
# Directory used by revsi_doc_to_visual to locate videos; resolved from the
# template's dataset_kwargs.cache_dir with HF_HOME passed as the base directory.
cache_dir = resolve_cache_dir(config["dataset_kwargs"]["cache_dir"], base_dir=HF_HOME)


def revsi_doc_to_visual(doc):
    """Return the path of the video that belongs to a ReVSI document.

    The path is ``<cache_dir>/<num_frames>_frame/<scene_id>.mp4``, built from
    the document's ``num_frames`` and ``scene_id`` fields.

    Args:
        doc: Mapping with ``num_frames`` and ``scene_id`` entries.

    Returns:
        A one-element list containing the video path.

    Raises:
        FileNotFoundError: If no file exists at the computed path.
    """
    video_path = os.path.join(cache_dir, f"{doc['num_frames']}_frame", f"{doc['scene_id']}.mp4")
    if not os.path.exists(video_path):
        # The file is missing, so FileNotFoundError is the matching exception.
        # FileExistsError means the opposite (a file is unexpectedly present).
        # Both are OSError subclasses, so handlers for OSError keep working.
        raise FileNotFoundError(f"video path:{video_path} does not exist.")
    return [video_path]


def revsi_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the text prompt for a ReVSI document.

    Numeric questions are rendered as ``pre_prompt``, question and
    ``nq_post_prompt``; multiple-choice questions additionally list the
    document's options before ``mcq_post_prompt``. Parts are joined with
    newlines and the result is stripped of surrounding whitespace.

    Args:
        doc: Mapping with ``question`` and ``question_type`` entries, plus
            ``options`` (an iterable of strings) for multiple-choice types.
        lmms_eval_specific_kwargs: Optional mapping that may provide
            ``pre_prompt``, ``nq_post_prompt`` and ``mcq_post_prompt``.
            Missing keys, or ``None``, fall back to empty strings.

    Returns:
        The prompt string.

    Raises:
        ValueError: If the question type is neither numeric nor multiple
            choice.
    """
    # The parameter defaults to None; treat that as "no prompt fragments"
    # instead of failing on None.get(...).
    prompt_kwargs = lmms_eval_specific_kwargs or {}
    question = doc["question"]
    question_type = doc["question_type"]
    pre_prompt = prompt_kwargs.get("pre_prompt", "")

    if question_type in NQ_QUESTION_TYPES:
        post_prompt = prompt_kwargs.get("nq_post_prompt", "")
        return "\n".join([pre_prompt, question, post_prompt]).strip()

    if question_type in MCQ_QUESTION_TYPES:
        options = "Options:\n" + "\n".join(doc["options"])
        post_prompt = prompt_kwargs.get("mcq_post_prompt", "")
        return "\n".join([pre_prompt, question, options, post_prompt]).strip()

    # Fail loudly rather than handing a None prompt to the model.
    raise ValueError(f"Unsupported ReVSI question type: {question_type!r}")


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Optionally shuffle the dataset before evaluation.

    When the ``LMMS_EVAL_SHUFFLE_DOCS`` environment variable is set to a
    non-empty value, the dataset is shuffled with a fixed seed of 42;
    otherwise it is returned unchanged.
    """
    shuffle_requested = os.getenv("LMMS_EVAL_SHUFFLE_DOCS", None)
    return dataset.shuffle(seed=42) if shuffle_requested else dataset


def _mean_relative_accuracy(pred, target, start, end, interval):
num_pts = (end - start) / interval + 2
conf_intervs = np.linspace(start, end, int(num_pts))
acc = (abs(pred - target) / target) <= (1 - conf_intervs)
return acc.mean()


def revsi_process_results(doc, results):
    """Score one model response and attach the score to its document.

    The predicted answer is the text of ``results[0]`` up to the first space,
    with trailing periods removed. Multiple-choice questions score 1.0 on a
    case-insensitive exact match with the ground truth and 0.0 otherwise.
    Numeric questions are scored with mean relative accuracy over the
    thresholds 0.5 to 0.95 in steps of 0.05; answers that cannot be parsed or
    scored count as 0.0.

    Args:
        doc: Mapping with ``ground_truth`` and ``question_type`` entries. It
            is updated in place with an ``acc`` entry.
        results: Sequence whose first element is the model output.

    Returns:
        Dict mapping every metric name in REVSI_METRICS to the scored ``doc``.

    Raises:
        ValueError: If the question type is neither numeric nor multiple
            choice.
    """
    pred_answer = str(results[0]).strip().split(" ")[0].rstrip(".").strip()
    gt_answer = doc["ground_truth"]
    question_type = doc["question_type"]

    if question_type in MCQ_QUESTION_TYPES:
        acc = 1.0 if pred_answer.lower() == gt_answer.lower() else 0.0
    elif question_type in NQ_QUESTION_TYPES:
        try:
            acc = _mean_relative_accuracy(float(pred_answer), float(gt_answer), 0.5, 0.95, 0.05)
        except (TypeError, ValueError, ArithmeticError):
            # Unparseable numbers or a zero ground truth score as a miss.
            # Only these errors are caught, so KeyboardInterrupt and
            # unrelated bugs are no longer swallowed.
            acc = 0.0
    else:
        # Without this branch `acc` would be unbound below.
        raise ValueError(f"Unsupported ReVSI question type: {question_type!r}")

    # The aggregation functions read "acc" and "question_type" from the doc.
    doc["acc"] = acc
    return {metric: doc for metric in REVSI_METRICS}


def _collapse_question_types(output, metric_name, question_types):
question_type_metrics = [
f"{question_type}_acc" for question_type in question_types if f"{question_type}_acc" in output
]
if not question_type_metrics:
return
output[metric_name] = np.mean([output.pop(metric) for metric in question_type_metrics])


def _compute_all_subscores(results) -> dict:
results = pd.DataFrame(results)
output = {
f"{question_type}_acc": per_question_type["acc"].mean()
for question_type, per_question_type in results.groupby("question_type")
}

for metric_name, question_types in COMPOSITE_METRICS.items():
_collapse_question_types(output, metric_name, question_types)

output["overall_acc"] = sum(output.values()) / len(output) if output else 0.0
return output


def _aggregate_metric(results, metric_name):
    """Return one metric from the computed subscores, or 0.0 if it is absent."""
    subscores = _compute_all_subscores(results)
    if metric_name in subscores:
        return subscores[metric_name]
    return 0.0


def revsi_aggregate_overall(results):
    """Aggregate scored ReVSI documents into the "overall_acc" metric."""
    return _aggregate_metric(results, metric_name="overall_acc")


def revsi_aggregate_object_abs_distance_acc(results):
    """Aggregate scored ReVSI documents into "object_abs_distance_acc"."""
    return _aggregate_metric(results, metric_name="object_abs_distance_acc")


def revsi_aggregate_object_counting_acc(results):
    """Aggregate scored ReVSI documents into "object_counting_acc"."""
    return _aggregate_metric(results, metric_name="object_counting_acc")


def revsi_aggregate_object_rel_direction_acc(results):
    """Aggregate scored ReVSI documents into "object_rel_direction_acc"."""
    return _aggregate_metric(results, metric_name="object_rel_direction_acc")


def revsi_aggregate_object_rel_distance_acc(results):
    """Aggregate scored ReVSI documents into "object_rel_distance_acc"."""
    return _aggregate_metric(results, metric_name="object_rel_distance_acc")


def revsi_aggregate_object_size_estimation_acc(results):
    """Aggregate scored ReVSI documents into "object_size_estimation_acc"."""
    return _aggregate_metric(results, metric_name="object_size_estimation_acc")


def revsi_aggregate_room_size_estimation_acc(results):
    """Aggregate scored ReVSI documents into "room_size_estimation_acc"."""
    return _aggregate_metric(results, metric_name="room_size_estimation_acc")


def revsi_aggregate_route_planning_acc(results):
    """Aggregate scored ReVSI documents into "route_planning_acc"."""
    return _aggregate_metric(results, metric_name="route_planning_acc")
Loading