[Benchmark Backfill] Integrate CountBench into lmms-eval (#1156)

Luodian · web-flow · commit 0b71775acd48 · 2026-02-23T15:20:05.000+08:00
* feat: add CountBench task config and scoring

* docs: add CountBench to current task catalog
diff --git a/docs/current_tasks.md b/docs/current_tasks.md
@@ -47,6 +47,7 @@ python -m lmms_eval --tasks list_with_num
     - COCO 2017 Caption MiniVal (coco2017_cap_val)
     - COCO 2017 Caption MiniTest (coco2017_cap_test)
 - [ConBench](https://github.com/foundation-multimodal-models/ConBench) (conbench)
+- [CountBench](https://huggingface.co/datasets/vikhyatk/CountBenchQA) (countbench)
 - [CV-Bench](https://github.com/nyu-visionx/CV-Bench) (cv_bench)
 - [DetailCaps-4870](https://github.com/foundation-multimodal-models/CAPTURE) (detailcaps)
 - [Flickr30K](https://github.com/BryanPlummer/flickr30k_entities) (flickr30k)
diff --git a/lmms_eval/tasks/countbench/countbench.yaml b/lmms_eval/tasks/countbench/countbench.yaml
@@ -0,0 +1,22 @@
+dataset_path: vikhyatk/CountBenchQA
+task: countbench
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.countbench_doc_to_visual
+doc_to_text: !function utils.countbench_doc_to_text
+doc_to_target: !function utils.countbench_doc_to_target
+generation_kwargs:
+  max_new_tokens: 16  # NOTE(review): presumably kept small because pre_prompt asks for a bare number
+  temperature: 0
+  do_sample: false
+process_results: !function utils.countbench_process_results
+metric_list:
+  - metric: acc  # key of the dict returned by utils.countbench_process_results
+    aggregation: mean  # per-sample acc is 0.0 or 1.0, so the mean is the fraction of exact matches
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "Look at the image carefully and count the objects. Answer with just a number, without any additional text. "  # prepended to the question by utils.countbench_doc_to_text
+    post_prompt: ""  # appended after the question by utils.countbench_doc_to_text
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/countbench/utils.py b/lmms_eval/tasks/countbench/utils.py
@@ -0,0 +1,106 @@
+"""Prompt construction and exact-match scoring helpers for the CountBench task."""
+
+import re
+
+NUMBER_WORD_TO_NUMERAL = {
+    "none": "0",
+    "zero": "0",
+    "one": "1",
+    "two": "2",
+    "three": "3",
+    "four": "4",
+    "five": "5",
+    "six": "6",
+    "seven": "7",
+    "eight": "8",
+    "nine": "9",
+    "ten": "10",
+    "eleven": "11",
+    "twelve": "12",
+    "thirteen": "13",
+    "fourteen": "14",
+    "fifteen": "15",
+    "sixteen": "16",
+    "seventeen": "17",
+    "eighteen": "18",
+    "nineteen": "19",
+    "twenty": "20",
+}
+
+# One integer or decimal literal, or one run of ASCII letters (input is lower-cased).
+_COUNT_TOKEN_RE = re.compile(r"\d+(?:\.\d+)?|[a-z]+")
+
+
+def _normalize_count_answer(answer) -> str:
+    """Reduce a count answer to a canonical numeral string.
+
+    The stripped, lower-cased answer is first looked up as a whole in
+    ``NUMBER_WORD_TO_NUMERAL``. Failing that, it is scanned left to right and
+    the first number literal or known number word decides the result, so
+    replies such as ``"3."``, ``"3 dogs"`` or ``"There are three dogs."`` all
+    normalize to ``"3"`` instead of failing an exact string comparison.
+
+    Args:
+        answer: Model output or ground-truth value; converted with ``str``.
+
+    Returns:
+        The numeral without leading zeros or a zero-only fraction; a literal
+        with a non-zero fraction verbatim; or, when no count is found, the
+        stripped, lower-cased text.
+    """
+    normalized = str(answer).strip().lower()
+    if normalized in NUMBER_WORD_TO_NUMERAL:
+        return NUMBER_WORD_TO_NUMERAL[normalized]
+    for token in _COUNT_TOKEN_RE.findall(normalized):
+        if token[0].isdigit():
+            whole, _, fraction = token.partition(".")
+            # "3.5" is not a count: keep it verbatim so it cannot match "3".
+            return token if fraction.strip("0") else str(int(whole))
+        if token in NUMBER_WORD_TO_NUMERAL:
+            return NUMBER_WORD_TO_NUMERAL[token]
+    return normalized
+
+
+def countbench_doc_to_visual(doc):
+    """Return the document's ``"image"`` as a one-element list, converted to RGB."""
+    return [doc["image"].convert("RGB")]
+
+
+def countbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the prompt as ``pre_prompt`` + stripped question + ``post_prompt``.
+
+    Args:
+        doc: Dataset record; its ``"question"`` field is used.
+        lmms_eval_specific_kwargs: Optional mapping that may carry
+            ``"pre_prompt"`` and ``"post_prompt"``; each defaults to ``""``.
+
+    Returns:
+        The assembled prompt string.
+    """
+    kwargs = lmms_eval_specific_kwargs or {}
+    pre_prompt = kwargs.get("pre_prompt", "")
+    post_prompt = kwargs.get("post_prompt", "")
+    question = doc["question"].strip()
+    return f"{pre_prompt}{question}{post_prompt}"
+
+
+def countbench_doc_to_target(doc):
+    """Return the normalized ground-truth count taken from ``doc["number"]``."""
+    return _normalize_count_answer(doc["number"])
+
+
+def countbench_process_results(doc, results):
+    """Score one sample by exact match of normalized prediction and target.
+
+    Args:
+        doc: Dataset record providing the ground-truth ``"number"``.
+        results: Model outputs for this record; only the first is scored.
+
+    Returns:
+        ``{"acc": 1.0}`` on a match, ``{"acc": 0.0}`` otherwise. An empty
+        ``results`` scores 0.0.
+    """
+    # No generation at all scores as wrong instead of raising IndexError.
+    prediction = _normalize_count_answer(results[0]) if results else None
+    target = countbench_doc_to_target(doc)
+    return {"acc": float(prediction == target)}