feat: integrate MathKangaroo benchmark task (#1135)

Luodian · Luodian · commit 06728565fb87 · 2026-02-22T20:55:31.000+08:00
diff --git a/docs/current_tasks.md b/docs/current_tasks.md
@@ -437,6 +437,7 @@ python -m lmms_eval --tasks list_with_num
 - [AIME](https://artofproblemsolving.com/wiki/index.php/AIME_Problems_and_Solutions) (aime)
 - [DynaMath](https://dynamath.github.io/) (dynamath)
 - [GSM8K](https://github.com/openai/grade-school-math) (gsm8k)
+- [MathKangaroo](https://huggingface.co/datasets/dfkiuser/kangaroo_math_mc_questions) (mathkangaroo)
 - [MathVerse](https://github.com/ZrrSkywalker/MathVerse) (mathverse)
   - MathVerse Text Dominant (mathverse_testmini_text_dominant)
   - MathVerse Text Only (mathverse_testmini_text_only)
diff --git a/lmms_eval/tasks/mathkangaroo/mathkangaroo.yaml b/lmms_eval/tasks/mathkangaroo/mathkangaroo.yaml
@@ -0,0 +1,25 @@
+dataset_path: dfkiuser/kangaroo_math_mc_questions  # Hugging Face dataset id
+dataset_kwargs:
+  token: True
+task: mathkangaroo
+tag: visual_reasoning_collection
+test_split: train  # the dataset's "train" split is what gets evaluated
+output_type: generate_until
+doc_to_visual: !function utils.mathkangaroo_doc_to_visual
+doc_to_text: !function utils.mathkangaroo_doc_to_text
+doc_to_target: "ground_truth"  # same field utils.mathkangaroo_process_results reads
+generation_kwargs:
+  max_new_tokens: 64  # small budget: post_prompt asks for the option letter only
+  temperature: 0
+  do_sample: false
+process_results: !function utils.mathkangaroo_process_results
+metric_list:
+  - metric: mathkangaroo_accuracy  # key returned by utils.mathkangaroo_process_results
+    aggregation: mean  # averages the per-sample 0.0/1.0 scores
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option letter (A, B, C, D, or E) only."  # mirrors the fallback in utils.mathkangaroo_doc_to_text
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/mathkangaroo/utils.py b/lmms_eval/tasks/mathkangaroo/utils.py
@@ -0,0 +1,53 @@
+import re
+from typing import Any
+
+
+def mathkangaroo_doc_to_visual(doc):
+    """Return ``[image.convert("RGB")]`` for the doc's "image" value, or ``[]`` if it is None or lacks ``convert``."""
+    picture = doc.get("image")
+    usable = picture is not None and hasattr(picture, "convert")
+    return [picture.convert("RGB")] if usable else []
+
+
+def mathkangaroo_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the prompt as pre_prompt + stripped question + post_prompt; absent kwargs use the defaults below."""
+    options = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
+    prefix = options.get("pre_prompt", "")
+    suffix = options.get("post_prompt", "\nAnswer with the option letter (A, B, C, D, or E) only.")
+    return f"{prefix}{str(doc.get('question', '')).strip()}{suffix}"
+
+
+def _normalize_targets(answer: Any) -> set[str]:
+    """Collect every letter A-E (case-insensitive) found in ``str(answer)``; ``None`` gives an empty set."""
+    letters = () if answer is None else re.findall(r"[A-E]", str(answer).upper())
+    return set(letters)
+
+
+def _extract_prediction(response: str) -> str:
+    """Return the option letter (A-E, upper-case) chosen in a model response, or "" if none.
+
+    Tries, in order: a phrase such as "answer is X" / "option: (X)", the last line that
+    is only a letter ("B", "(B)", "B."), then the last standalone letter in the text.
+    """
+    if not response:
+        return ""
+    text = str(response).strip()
+    # The \b guards keep a keyword from capturing the first letter of the next
+    # word ("answer is definitely C" -> not "D") or its own suffix ("answered").
+    direct_match = re.search(r"(?i)\b(?:final\s+answer|answer|option)\s*(?:is|:)?\s*\(?([A-E])\b\)?", text)
+    if direct_match:
+        return direct_match.group(1).upper()
+    for line in reversed(text.splitlines()):
+        line_match = re.fullmatch(r"\(?([A-E])\)?[\.)]?", line.strip().upper())
+        if line_match:
+            return line_match.group(1)
+    # Last resort: the final standalone letter anywhere in the upper-cased text.
+    candidates = re.findall(r"\b([A-E])\b", text.upper())
+    return candidates[-1] if candidates else ""
+
+
+def mathkangaroo_process_results(doc, results):
+    """Score 1.0 if the letter extracted from the first result is among the doc's ground-truth letters, else 0.0."""
+    raw_response = results[0] if results else ""
+    guess, accepted = _extract_prediction(raw_response), _normalize_targets(doc.get("ground_truth"))
+    return {"mathkangaroo_accuracy": 1.0 if guess and guess in accepted else 0.0}