Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/current_tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ python -m lmms_eval --tasks list_with_num
- COCO 2017 Caption MiniVal (coco2017_cap_val)
- COCO 2017 Caption MiniTest (coco2017_cap_test)
- [ConBench](https://github.com/foundation-multimodal-models/ConBench) (conbench)
- [CountBench](https://huggingface.co/datasets/vikhyatk/CountBenchQA) (countbench)
- [CV-Bench](https://github.com/nyu-visionx/CV-Bench) (cv_bench)
- [DetailCaps-4870](https://github.com/foundation-multimodal-models/CAPTURE) (detailcaps)
- [Flickr30K](https://github.com/BryanPlummer/flickr30k_entities) (flickr30k)
Expand Down
22 changes: 22 additions & 0 deletions lmms_eval/tasks/countbench/countbench.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# lmms-eval task configuration for CountBench (object-counting questions on images).
# Dataset repository on the Hugging Face Hub.
dataset_path: vikhyatk/CountBenchQA
task: countbench
test_split: test
# Free-form generation; the reply is scored by utils.countbench_process_results.
output_type: generate_until
# Hooks implemented in utils.py next to this file.
doc_to_visual: !function utils.countbench_doc_to_visual
doc_to_text: !function utils.countbench_doc_to_text
doc_to_target: !function utils.countbench_doc_to_target
generation_kwargs:
# The prompt asks for a bare number, so a small token budget with sampling disabled is used.
max_new_tokens: 16
temperature: 0
do_sample: false
process_results: !function utils.countbench_process_results
metric_list:
# "acc" is the key returned per sample by utils.countbench_process_results.
- metric: acc
aggregation: mean
higher_is_better: true
lmms_eval_specific_kwargs:
default:
# pre_prompt is placed before and post_prompt after the dataset question
# by utils.countbench_doc_to_text.
pre_prompt: "Look at the image carefully and count the objects. Answer with just a number, without any additional text. "
post_prompt: ""
metadata:
- version: 0.0
51 changes: 51 additions & 0 deletions lmms_eval/tasks/countbench/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Spelled-out counts mapped to their digit form, so that a textual answer
# ("three") and a numeric one ("3") normalize to the same string.
NUMBER_WORD_TO_NUMERAL = {
    "none": "0",
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
    "eleven": "11",
    "twelve": "12",
    "thirteen": "13",
    "fourteen": "14",
    "fifteen": "15",
    "sixteen": "16",
    "seventeen": "17",
    "eighteen": "18",
    "nineteen": "19",
    "twenty": "20",
}

# Characters tolerated at either edge of an answer: whitespace, sentence
# punctuation, quotes and markdown emphasis (e.g. "3.", "'three'", "**3**").
# Only the edges are trimmed, so an interior "." as in "3.5" is preserved.
_ANSWER_EDGE_CHARS = " \t\r\n.,;:!?\"'`*"


def _normalize_count_answer(answer) -> str:
    """Return a canonical string form of a count answer.

    The value is converted with ``str()``, stripped of surrounding
    whitespace, lower-cased, and then stripped of the edge characters in
    ``_ANSWER_EDGE_CHARS`` so that replies such as ``"3."`` or ``"Three."``
    are not rejected merely because of trailing punctuation. A spelled-out
    number found in ``NUMBER_WORD_TO_NUMERAL`` is replaced by its digits;
    anything else is returned as cleaned.

    Args:
        answer: Model output or reference value; any object accepted by
            ``str()``.

    Returns:
        The normalized answer string.
    """
    # The bare .strip() is kept first so that every whitespace character the
    # original implementation removed is still removed.
    normalized = str(answer).strip().lower().strip(_ANSWER_EDGE_CHARS)
    return NUMBER_WORD_TO_NUMERAL.get(normalized, normalized)


def countbench_doc_to_visual(doc):
    """Return the sample's image, converted to RGB, as a one-element list.

    Args:
        doc: Dataset record; ``doc["image"]`` must provide ``convert(mode)``.

    Returns:
        A list holding the result of ``doc["image"].convert("RGB")``.
    """
    rgb_image = doc["image"].convert("RGB")
    return [rgb_image]


def countbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the text prompt for one sample.

    Args:
        doc: Dataset record; ``doc["question"]`` is the question string.
        lmms_eval_specific_kwargs: Optional mapping that may carry
            ``"pre_prompt"`` and ``"post_prompt"``; a missing key defaults
            to the empty string. ``None`` or an empty mapping means no
            extra text.

    Returns:
        The pre-prompt, the whitespace-stripped question and the
        post-prompt concatenated in that order.
    """
    prompt_cfg = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
    prefix = prompt_cfg.get("pre_prompt", "")
    suffix = prompt_cfg.get("post_prompt", "")
    stripped_question = doc["question"].strip()
    return "{}{}{}".format(prefix, stripped_question, suffix)


def countbench_doc_to_target(doc):
    """Return the reference count for a sample as a normalized string.

    Args:
        doc: Dataset record; ``doc["number"]`` holds the reference count.

    Returns:
        ``doc["number"]`` passed through ``_normalize_count_answer``.
    """
    reference_count = doc["number"]
    return _normalize_count_answer(reference_count)


def countbench_process_results(doc, results):
    """Score one sample by exact match of normalized answers.

    Args:
        doc: Dataset record supplying the reference count.
        results: Sequence of model outputs; only ``results[0]`` is used.

    Returns:
        ``{"acc": 1.0}`` when the normalized prediction equals the
        normalized target, otherwise ``{"acc": 0.0}``.
    """
    model_answer = _normalize_count_answer(results[0])
    gold_answer = countbench_doc_to_target(doc)
    is_correct = model_answer == gold_answer
    return {"acc": 1.0 if is_correct else 0.0}