Skip to content

Commit 0672856

Browse files
committed
feat: integrate MathKangaroo benchmark task (#1135)
1 parent f729caf commit 0672856

3 files changed

Lines changed: 79 additions & 0 deletions

File tree

docs/current_tasks.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,7 @@ python -m lmms_eval --tasks list_with_num
437437
- [AIME](https://artofproblemsolving.com/wiki/index.php/AIME_Problems_and_Solutions) (aime)
438438
- [DynaMath](https://dynamath.github.io/) (dynamath)
439439
- [GSM8K](https://github.com/openai/grade-school-math) (gsm8k)
440+
- [MathKangaroo](https://huggingface.co/datasets/dfkiuser/kangaroo_math_mc_questions) (mathkangaroo)
440441
- [MathVerse](https://github.com/ZrrSkywalker/MathVerse) (mathverse)
441442
- MathVerse Text Dominant (mathverse_testmini_text_dominant)
442443
- MathVerse Text Only (mathverse_testmini_text_only)
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Task configuration for the MathKangaroo multiple-choice benchmark.
dataset_path: dfkiuser/kangaroo_math_mc_questions
2+
dataset_kwargs:
3+
  # presumably forwards the stored Hugging Face auth token when loading the dataset — TODO confirm
  token: True
4+
task: mathkangaroo
5+
tag: visual_reasoning_collection
6+
# NOTE(review): evaluation reads the split named `train` — confirm the dataset ships no dedicated test split
test_split: train
7+
output_type: generate_until
8+
doc_to_visual: !function utils.mathkangaroo_doc_to_visual
9+
doc_to_text: !function utils.mathkangaroo_doc_to_text
10+
# Same field that utils.mathkangaroo_process_results reads from each document.
doc_to_target: "ground_truth"
11+
generation_kwargs:
12+
  # Small budget: the post_prompt below asks for a single option letter.
  max_new_tokens: 64
13+
  temperature: 0
14+
  do_sample: false
15+
process_results: !function utils.mathkangaroo_process_results
16+
metric_list:
17+
  # Must match the key of the dict returned by utils.mathkangaroo_process_results.
  - metric: mathkangaroo_accuracy
18+
    aggregation: mean
19+
    higher_is_better: true
20+
lmms_eval_specific_kwargs:
21+
  default:
22+
    pre_prompt: ""
23+
    # Same text as the fallback default inside utils.mathkangaroo_doc_to_text.
    post_prompt: "\nAnswer with the option letter (A, B, C, D, or E) only."
24+
metadata:
25+
  - version: 0.0
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import re
2+
from typing import Any
3+
4+
5+
def mathkangaroo_doc_to_visual(doc):
    """Return the document's image as a one-element list of RGB images.

    The value stored under ``doc["image"]`` is used only when it is not
    ``None`` and exposes a ``convert`` attribute; in every other case an
    empty list is returned.
    """
    picture = doc.get("image")
    if picture is None or not hasattr(picture, "convert"):
        return []
    return [picture.convert("RGB")]
10+
11+
12+
def mathkangaroo_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the prompt text for one document.

    The prompt is ``pre_prompt + question + post_prompt``. The question is
    read from ``doc["question"]`` (empty when absent), converted to ``str``
    and stripped of surrounding whitespace. ``pre_prompt`` and
    ``post_prompt`` come from *lmms_eval_specific_kwargs*; when that mapping
    is missing or empty, or a key is absent, the defaults below are used.
    """
    fallback_suffix = "\nAnswer with the option letter (A, B, C, D, or E) only."
    options = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
    prefix = options.get("pre_prompt", "")
    suffix = options.get("post_prompt", fallback_suffix)
    stem = str(doc.get("question", "")).strip()
    return f"{prefix}{stem}{suffix}"
18+
19+
20+
def _normalize_targets(answer: Any) -> set[str]:
21+
if answer is None:
22+
return set()
23+
return set(re.findall(r"[A-E]", str(answer).upper()))
24+
25+
26+
def _extract_prediction(response: str) -> str:
27+
if not response:
28+
return ""
29+
30+
text = str(response).strip()
31+
direct_match = re.search(r"(?i)(?:final\s+answer|answer|option)\s*(?:is|:)?\s*\(?([A-E])\)?", text)
32+
if direct_match:
33+
return direct_match.group(1).upper()
34+
35+
for line in reversed(text.splitlines()):
36+
line = line.strip().upper()
37+
if not line:
38+
continue
39+
line_match = re.fullmatch(r"\(?([A-E])\)?[\.)]?", line)
40+
if line_match:
41+
return line_match.group(1)
42+
43+
candidates = re.findall(r"\b([A-E])\b", text.upper())
44+
if candidates:
45+
return candidates[-1]
46+
return ""
47+
48+
49+
def mathkangaroo_process_results(doc, results):
    """Score a single model response for one document.

    The first entry of *results* (or ``""`` when *results* is empty) is
    reduced to an option letter and compared with the letters found in
    ``doc["ground_truth"]``.

    Returns:
        ``{"mathkangaroo_accuracy": 1.0}`` when a letter was extracted and it
        is among the ground-truth letters, otherwise
        ``{"mathkangaroo_accuracy": 0.0}``.
    """
    raw_response = results[0] if results else ""
    predicted_letter = _extract_prediction(raw_response)
    accepted_letters = _normalize_targets(doc.get("ground_truth"))
    is_correct = bool(predicted_letter) and predicted_letter in accepted_letters
    return {"mathkangaroo_accuracy": 1.0 if is_correct else 0.0}

0 commit comments

Comments
 (0)