Skip to content

Commit e8cde29

Browse files
authored
feat: integrate FSC-147 benchmark task (#1163)
1 parent 63bea01 commit e8cde29

3 files changed

Lines changed: 135 additions & 0 deletions

File tree

docs/current_tasks.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ python -m lmms_eval --tasks list_with_num
5050
- [CountBench](https://huggingface.co/datasets/vikhyatk/CountBenchQA) (countbench)
5151
- [CV-Bench](https://github.com/nyu-visionx/CV-Bench) (cv_bench)
5252
- [DetailCaps-4870](https://github.com/foundation-multimodal-models/CAPTURE) (detailcaps)
53+
- [FSC-147](https://github.com/cvlab-stonybrook/LearningToCountEverything) (fsc147)
5354
- [Flickr30K](https://github.com/BryanPlummer/flickr30k_entities) (flickr30k)
5455
- Flickr30K Test (flickr30k_test)
5556
- [GQA](https://cs.stanford.edu/people/dorarad/gqa/index.html) (gqa)

lmms_eval/tasks/fsc147/fsc147.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
dataset_path: yifehuang97/CoCount-train-fsc147
2+
task: fsc147
3+
test_split: train
4+
output_type: generate_until
5+
doc_to_visual: !function utils.fsc147_doc_to_visual
6+
doc_to_text: !function utils.fsc147_doc_to_text
7+
doc_to_target: !function utils.fsc147_doc_to_target
8+
generation_kwargs:
9+
max_new_tokens: 32
10+
temperature: 0
11+
do_sample: false
12+
process_results: !function utils.fsc147_process_results
13+
metric_list:
14+
- metric: fsc147_exact_match
15+
aggregation: !function utils.fsc147_aggregate_exact_match
16+
higher_is_better: true
17+
- metric: fsc147_mae
18+
aggregation: !function utils.fsc147_aggregate_mae
19+
higher_is_better: false
20+
lmms_eval_specific_kwargs:
21+
default:
22+
pre_prompt: ""
23+
post_prompt: "\nAnswer with only an integer."
24+
metadata:
25+
- version: 0.0

lmms_eval/tasks/fsc147/utils.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import re
2+
from typing import Any
3+
4+
from PIL import Image
5+
6+
_COUNT_KEYS = ["annotated_pos_count", "pos_count", "count", "answer", "label", "gt_count", "gt_num"]
7+
_CAPTION_KEYS = ["pos_caption", "caption", "question", "query", "prompt", "text"]
8+
_IMAGE_KEYS = ["image", "img", "query_image"]
9+
10+
11+
def _to_rgb(image_obj: Any):
    """Return ``image_obj`` converted to RGB mode.

    Anything that is not a ``PIL.Image.Image`` instance yields ``None``.
    """
    if not isinstance(image_obj, Image.Image):
        return None
    return image_obj.convert("RGB")
15+
16+
17+
def _to_int(value: Any):
18+
if value is None or isinstance(value, bool):
19+
return None
20+
21+
if isinstance(value, (int, float)):
22+
return int(round(float(value)))
23+
24+
text = str(value).strip().replace(",", "")
25+
if not text:
26+
return None
27+
28+
match = re.search(r"-?\d+(?:\.\d+)?", text)
29+
if not match:
30+
return None
31+
32+
try:
33+
return int(round(float(match.group(0))))
34+
except ValueError:
35+
return None
36+
37+
38+
def _extract_count(doc: dict):
    """Return the first count in ``doc`` that parses to an integer.

    The fields named in ``_COUNT_KEYS`` are tried in order and each value is
    passed through ``_to_int``. Returns ``None`` when no field yields a count.
    """
    parsed = (_to_int(doc.get(field)) for field in _COUNT_KEYS)
    # The generator is lazy, so lookup stops at the first usable value.
    return next((number for number in parsed if number is not None), None)
44+
45+
46+
def _extract_caption(doc: dict) -> str:
    """Return a cleaned object phrase for the counting question.

    The fields named in ``_CAPTION_KEYS`` are tried in order. A string value
    is stripped, loses a leading "The "/"the ", and loses trailing periods
    and spaces. The first field whose cleaned text is non-empty wins.

    Args:
        doc: Dataset record to read the caption from.

    Returns:
        The cleaned caption, or ``"objects"`` when no field provides one.
    """
    for key in _CAPTION_KEYS:
        value = doc.get(key)
        if not isinstance(value, str):
            continue
        text = re.sub(r"^[Tt]he\s+", "", value.strip()).rstrip(". ")
        # A caption such as "." or "The ." cleans down to nothing; skip it so
        # the prompt never reads "How many  are there in the image?".
        if text:
            return text
    return "objects"
54+
55+
56+
def fsc147_doc_to_visual(doc):
    """Return the document's image as a one-element list of RGB images.

    The fields named in ``_IMAGE_KEYS`` are tried in order and the first value
    that ``_to_rgb`` can convert is used. Returns an empty list when none can.
    """
    for field in _IMAGE_KEYS:
        rgb_image = _to_rgb(doc.get(field))
        if rgb_image is not None:
            return [rgb_image]
    return []
64+
65+
66+
def fsc147_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the counting prompt for ``doc``.

    Args:
        doc: Dataset record; its caption supplies the object phrase.
        lmms_eval_specific_kwargs: Optional mapping that may provide
            ``pre_prompt`` and ``post_prompt`` strings placed around the
            question. Missing keys default to the empty string.

    Returns:
        The prompt text: pre-prompt, question, post-prompt.
    """
    options = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
    prefix = options.get("pre_prompt", "")
    suffix = options.get("post_prompt", "")

    noun_phrase = _extract_caption(doc)
    return f"{prefix}How many {noun_phrase} are there in the image?{suffix}"
74+
75+
76+
def fsc147_doc_to_target(doc):
    """Return the ground-truth count as a string, or ``""`` when ``doc`` has none."""
    count = _extract_count(doc)
    return "" if count is None else str(count)
81+
82+
83+
def fsc147_process_results(doc, results):
    """Score one model response against the document's ground-truth count.

    Args:
        doc: Dataset record holding the ground-truth count.
        results: Model outputs; only the first element is used.

    Returns:
        A dict with ``fsc147_exact_match`` (1.0 or 0.0) and ``fsc147_mae``
        (absolute count error). A document without a ground-truth count
        scores 0.0 on both metrics. A response with no parseable number
        scores 0.0 exact match and an error equal to ``abs(target)``.
    """
    raw_answer = str(results[0]).strip() if results else ""
    truth = _extract_count(doc)

    if truth is None:
        exact, error = 0.0, 0.0
    else:
        guess = _to_int(raw_answer)
        if guess is None:
            exact, error = 0.0, float(abs(truth))
        else:
            exact, error = float(guess == truth), float(abs(guess - truth))

    return {"fsc147_exact_match": exact, "fsc147_mae": error}
98+
99+
100+
def fsc147_aggregate_exact_match(items):
    """Return the mean of the per-sample exact-match scores (0.0 when empty)."""
    sample_count = len(items) if items else 0
    if sample_count == 0:
        return 0.0

    total = 0.0
    for score in items:
        total += float(score)
    return total / sample_count
104+
105+
106+
def fsc147_aggregate_mae(items):
    """Return the mean of the per-sample absolute errors (0.0 when empty)."""
    if items:
        errors = [float(err) for err in items]
        return sum(errors) / len(items)
    return 0.0

0 commit comments

Comments
 (0)