Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/current_tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ python -m lmms_eval --tasks list_with_num
- [EmbSpatial](https://github.com/EmbSpatial/EmbSpatial) (embspatial)
- [ERQA](https://github.com/ERQA-Bench/ERQA) (erqa)
- [OmniSpatial](https://omnispatial.github.io/) (omnispatial)
- [Point-Bench](https://pointarena.github.io/) (pointbench)
- [Where2Place](https://where2place.github.io/) (where2place)

---
Expand Down
22 changes: 22 additions & 0 deletions lmms_eval/tasks/pointbench/_default_template_yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Shared settings for the PointBench task; pointbench.yaml pulls this file in via `include`.
dataset_path: json
dataset_kwargs:
data_files:
# Only a `train` split is declared here; pointbench.yaml evaluates on it via `test_split: train`.
train: https://huggingface.co/datasets/PointArena/pointarena-data/raw/main/data.json
output_type: generate_until
# The `!function` entries below name callables defined in this task's utils.py.
process_docs: !function utils.pointbench_process_docs
doc_to_visual: !function utils.pointbench_doc_to_visual
doc_to_text: !function utils.pointbench_doc_to_text
# No textual target: scoring is done in process_results against a mask file.
doc_to_target: ""
process_results: !function utils.pointbench_process_results

metric_list:
- metric: pointbench_acc
aggregation: !function utils.pointbench_aggregate_results
higher_is_better: true

generation_kwargs:
max_new_tokens: 256

metadata:
source_dataset: PointArena/pointarena-data
# Read by utils.pointbench_doc_to_text: "0_999" selects the 1000x1000-grid answer
# format; any other value selects the normalized 0-1 answer format.
prompt_suffix_type: "0_999"
3 changes: 3 additions & 0 deletions lmms_eval/tasks/pointbench/pointbench.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# PointBench task entry; the shared settings come from the included template file.
task: pointbench
# The template's data_files declare only a `train` split, so that split is evaluated.
test_split: train
include: _default_template_yaml
171 changes: 171 additions & 0 deletions lmms_eval/tasks/pointbench/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import re
import zipfile
from functools import lru_cache
from io import BytesIO
from typing import Any, Dict, List

import datasets
import numpy as np
import requests
from PIL import Image

from lmms_eval.tasks._task_utils.default_template_yaml import load_default_template_yaml
from lmms_eval.utils import eval_logger

# Hugging Face dataset repository used for the rows API query and the mask archive download.
POINTARENA_REPO = "PointArena/pointarena-data"
# Endpoint queried one row at a time to resolve the image URL of a dataset row.
POINTARENA_ROWS_API = "https://datasets-server.huggingface.co/rows"

# Answer-format instructions appended to every prompt. The variant is chosen by the
# `prompt_suffix_type` metadata entry: "0_999" selects the 1000x1000-grid wording,
# anything else selects the normalized 0-1 wording.
PROMPT_SUFFIX_0_999 = "Your answer should be formatted as a list of tuples, i.e. [(x1, y1), (x2, y2), ...], where each tuple contains the x and y coordinates of a point satisfying the conditions above. The coordinates should be integers between 0 and 999, representing the pixel locations scaled to a 1000x1000 grid."
PROMPT_SUFFIX_ORIGINAL = "Your answer should be formatted as a list of tuples, i.e. [(x1, y1), (x2, y2), ...], where each tuple contains the x and y coordinates of a point satisfying the conditions above. The coordinates should be between 0 and 1, indicating the normalized pixel locations of the points in the image."
# Closing instruction placed after the suffix in every prompt.
FORMAT = "Return only list of tuples, do not add anything else."

# Task configuration obtained from load_default_template_yaml for this module's path;
# its "metadata" mapping is consulted when building prompts.
config = load_default_template_yaml(__file__)


def pointbench_process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Attach each row's positional index to the dataset.

    Args:
        dataset: The loaded split.

    Returns:
        The result of ``dataset.map`` where every row gains ``question_id``
        and ``row_idx``, both set to the row's index.
    """

    def _attach_index(_, idx):
        # The row content is irrelevant here; only its position is recorded.
        return {"question_id": idx, "row_idx": idx}

    return dataset.map(_attach_index, with_indices=True)


def pointbench_doc_to_text(doc: Dict[str, Any], lmms_eval_specific_kwargs: Dict[str, Any] | None = None) -> str:
    """Build the text prompt for one document.

    The prompt is ``pre_prompt`` + the document's ``user_input`` + the
    answer-format suffix + ``FORMAT`` + ``post_prompt``, stripped of leading
    and trailing whitespace.

    Args:
        doc: Dataset row; its ``user_input`` field (default ``""``) is the question.
        lmms_eval_specific_kwargs: Optional mapping that may carry
            ``pre_prompt`` and ``post_prompt`` strings.

    Returns:
        The assembled prompt string.
    """
    # The suffix variant is driven by the task metadata, defaulting to the 0-999 grid.
    metadata = config.get("metadata", {})
    if metadata.get("prompt_suffix_type", "0_999") == "0_999":
        answer_format = PROMPT_SUFFIX_0_999
    else:
        answer_format = PROMPT_SUFFIX_ORIGINAL

    extras = lmms_eval_specific_kwargs or {}
    prefix = extras.get("pre_prompt", "")
    postfix = extras.get("post_prompt", "")
    question = str(doc.get("user_input", "")).strip()

    prompt = f"{prefix}{question} {answer_format} {FORMAT}{postfix}"
    return prompt.strip()


@lru_cache(maxsize=4096)
def _get_image_url(row_idx: int) -> str:
    """Resolve the image URL of one dataset row through the rows API.

    Results are memoized per ``row_idx`` (up to 4096 entries).

    Args:
        row_idx: Offset of the row in the ``train`` split.

    Returns:
        The ``src`` value of the row's ``image`` field.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        ValueError: If the response contains no rows.
    """
    query = {
        "dataset": POINTARENA_REPO,
        "config": "default",
        "split": "train",
        "offset": int(row_idx),
        "length": 1,
    }
    reply = requests.get(POINTARENA_ROWS_API, params=query, timeout=30)
    reply.raise_for_status()

    rows = reply.json().get("rows", [])
    if rows:
        return rows[0]["row"]["image"]["src"]
    raise ValueError(f"No rows found for row_idx={row_idx}")


def _load_image(row_idx: int) -> Image.Image:
    """Download the image for a dataset row and return it in RGB mode.

    Args:
        row_idx: Offset of the row whose image should be fetched.

    Returns:
        The decoded image converted to ``"RGB"``.

    Raises:
        requests.HTTPError: If the final download attempt returns an error status.
    """
    reply = requests.get(_get_image_url(row_idx), timeout=60)
    if reply.status_code == 403:
        # A 403 triggers one retry with a freshly resolved URL; presumably the
        # cached URL has expired (TODO confirm). Note that this drops every
        # cached URL, not just the one for this row.
        _get_image_url.cache_clear()
        reply = requests.get(_get_image_url(row_idx), timeout=60)
    reply.raise_for_status()

    raw = BytesIO(reply.content)
    return Image.open(raw).convert("RGB")


def pointbench_doc_to_visual(doc: Dict[str, Any]) -> List[Image.Image]:
    """Return the image for a document as a one-element list.

    The row index is taken from ``row_idx``, falling back to ``question_id``.
    Any failure is logged as a warning and yields an empty list instead of
    raising.

    Args:
        doc: Dataset row.

    Returns:
        ``[image]`` on success, ``[]`` when the index is missing or loading fails.
    """
    index = doc.get("row_idx", doc.get("question_id"))
    if index is None:
        eval_logger.warning("pointbench: missing row_idx for doc={}", doc.get("image_filename", "unknown"))
        return []

    try:
        return [_load_image(int(index))]
    except Exception as exc:
        # Best effort: a missing image must not abort the whole evaluation run.
        eval_logger.warning("pointbench: failed to load image for row_idx={} ({})", index, exc)
        return []


@lru_cache(maxsize=1)
def _mask_zip_path() -> str:
    """Return the path of ``selected_masks.zip`` as provided by ``hf_hub_download``.

    The result is memoized, so the hub is consulted once per process.
    """
    # Imported lazily so the module can be imported without huggingface_hub.
    from huggingface_hub import hf_hub_download

    archive_path = hf_hub_download(
        repo_id=POINTARENA_REPO,
        repo_type="dataset",
        filename="selected_masks.zip",
    )
    return archive_path


@lru_cache(maxsize=1)
def _mask_member_map() -> Dict[str, str]:
    """Index the PNG members of the mask archive by base filename.

    Returns:
        Mapping from a member's base name (text after the last ``/``) to its
        full path inside the archive. When several members share a base name,
        the first one listed wins. The mapping is memoized.
    """
    by_basename: Dict[str, str] = {}
    with zipfile.ZipFile(_mask_zip_path()) as zf:
        png_members = (name for name in zf.namelist() if name.lower().endswith(".png"))
        for name in png_members:
            basename = name.rsplit("/", 1)[-1]
            if basename not in by_basename:
                by_basename[basename] = name
    return by_basename


@lru_cache(maxsize=4096)
def _load_mask(mask_filename: str) -> np.ndarray | None:
    """Load a mask from the archive as a binary ``int32`` array.

    Args:
        mask_filename: Base filename of the mask inside the archive.

    Returns:
        A 2-D array holding 1 where the grayscale value exceeds 127 and 0
        elsewhere, or ``None`` when the archive has no such member. Results
        are memoized per filename.
    """
    archive_member = _mask_member_map().get(mask_filename)
    if not archive_member:
        return None

    with zipfile.ZipFile(_mask_zip_path()) as zf, zf.open(archive_member) as handle:
        png_bytes = BytesIO(handle.read())
        grayscale = Image.open(png_bytes).convert("L")

    binary = np.array(grayscale) > 127
    return binary.astype(np.int32)


def _text_to_points(text: str, width: int, height: int) -> np.ndarray:
pattern = r"\(([-+]?\d*\.?\d+)\s*,\s*([-+]?\d*\.?\d+)\)"
matches = re.findall(pattern, text)

points = []
for x_raw, y_raw in matches:
x = float(x_raw)
y = float(y_raw)

if 0.0 <= x <= 1.0 and 0.0 <= y <= 1.0:
px = int(round(x * width))
py = int(round(y * height))
elif 0.0 <= x <= 1000.0 and 0.0 <= y <= 1000.0:
px = int(round((x / 1000.0) * width))
py = int(round((y / 1000.0) * height))
else:
px = int(round(x))
py = int(round(y))

points.append((px, py))

return np.array(points, dtype=np.int32)


def pointbench_process_results(doc: Dict[str, Any], result: List[str]) -> Dict[str, Dict[str, Any]]:
    """Score one model response against the document's mask.

    The accuracy is the fraction of parsed points that land on a mask pixel
    equal to 1; points outside the mask bounds count as misses. It is 0.0
    when no points are parsed or when the mask cannot be found (the latter
    also logs a warning).

    Args:
        doc: Dataset row; ``mask_filename``, ``question_id``,
            ``image_filename`` and ``category`` are read from it.
        result: Model outputs; only the first entry is used.

    Returns:
        ``{"pointbench_acc": submission}`` where ``submission`` holds ``id``,
        ``pred``, ``parsed_points``, ``accuracy`` and ``category``.
    """
    mask_name = str(doc.get("mask_filename", ""))
    mask = _load_mask(mask_name)
    prediction = result[0] if result else ""

    parsed_points: List[Any] = []
    score = 0.0

    if mask is None:
        eval_logger.warning("pointbench: failed to find mask for file={}", mask_name)
    else:
        n_rows, n_cols = mask.shape[0], mask.shape[1]
        pts = _text_to_points(prediction, n_cols, n_rows)
        parsed_points = list(map(tuple, pts.tolist()))

        if len(pts) > 0:
            xs, ys = pts[:, 0], pts[:, 1]
            inside = (xs >= 0) & (xs < n_cols) & (ys >= 0) & (ys < n_rows)
            # Mask is indexed as [row, column], i.e. [y, x].
            hits = mask[ys[inside], xs[inside]]
            # Each out-of-bounds point contributes a zero to the mean.
            misses = np.zeros(pts.shape[0] - inside.sum())
            score = np.concatenate([hits, misses]).mean()

    submission = {
        "id": doc.get("question_id", doc.get("image_filename", "unknown")),
        "pred": prediction,
        "parsed_points": parsed_points,
        "accuracy": float(score),
        "category": doc.get("category", "unknown"),
    }
    return {"pointbench_acc": submission}


def pointbench_aggregate_results(results: List[Dict[str, Any]]) -> float:
    """Average the per-sample accuracies.

    Args:
        results: Per-sample submissions; a missing ``accuracy`` counts as 0.0.

    Returns:
        The mean accuracy as a Python float, or 0.0 for empty input.
    """
    if results:
        scores = [entry.get("accuracy", 0.0) for entry in results]
        return float(np.mean(scores))
    return 0.0