Skip to content

Commit ff32722

Browse files
authored
feat: add ARC-AGI-1, ARC-AGI-2, and BrowseComp benchmark tasks (#1190)
- ARC-AGI-1: visual reasoning using mertaylin/arc-agi-images (400 eval samples)
- ARC-AGI-2: visual reasoning using vincentkoc/arc-agi-2 (120 eval samples)
- BrowseComp: OpenAI info-finding benchmark with XOR-encrypted data and LLM judge

Resolves LMM-306, LMM-307, LMM-273
1 parent 0d93809 commit ff32722

12 files changed

Lines changed: 801 additions & 0 deletions

File tree

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# ARC-AGI-1 (Image)
2+
3+
ARC-AGI-1 visual reasoning benchmark using a community-rendered image dataset.
4+
5+
- Original: https://github.com/fchollet/ARC-AGI
6+
- Dataset: https://huggingface.co/datasets/mertaylin/arc-agi-images
7+
- Note: Uses a community image conversion (not the official ARC Prize packaging).
8+
9+
## Usage
10+
```bash
11+
python -m lmms_eval --tasks arc_agi_1 --model <model_name> --batch_size 1
12+
```
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
dataset_path: mertaylin/arc-agi-images
2+
dataset_kwargs:
3+
token: false
4+
output_type: generate_until
5+
doc_to_visual: !function utils.arc_agi_1_doc_to_visual
6+
doc_to_text: !function utils.arc_agi_1_doc_to_text
7+
doc_to_target: !function utils.arc_agi_1_doc_to_target
8+
generation_kwargs:
9+
max_new_tokens: 1024
10+
temperature: 0
11+
top_p: 1.0
12+
do_sample: false
13+
process_results: !function utils.arc_agi_1_process_results
14+
metric_list:
15+
- metric: arc_agi_1_acc
16+
aggregation: mean
17+
higher_is_better: true
18+
lmms_eval_specific_kwargs:
19+
default:
20+
pre_prompt: ""
21+
post_prompt: ""
22+
metadata:
23+
version: 0.0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
include: _default_template_yaml
2+
task: arc_agi_1
3+
test_split: eval

lmms_eval/tasks/arc_agi_1/utils.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
import json
2+
import re
3+
from typing import Any
4+
5+
from loguru import logger as eval_logger
6+
from PIL import Image
7+
8+
# Captures the body of a Markdown code fence. The optional "json" language tag
# is matched case-insensitively and the body may span several lines.
CODE_FENCE_PATTERN = re.compile(
    r"```(?:json)?\s*(.*?)```",
    flags=re.DOTALL | re.IGNORECASE,
)
# Lazily matches a span that opens with "[[" and closes with "]]" (whitespace
# is allowed between the brackets), i.e. a candidate two-dimensional array.
GRID_PATTERN = re.compile(
    r"\[\s*\[.*?\]\s*\]",
    flags=re.DOTALL,
)
10+
11+
12+
def _extract_first_image(image_value):
    """Return the first image held by *image_value* converted to RGB, or ``None``.

    A list or tuple is reduced to its first element; an empty one yields
    ``None``. A ``PIL.Image.Image`` is converted directly, so a conversion
    error propagates to the caller. Any other object exposing ``convert`` is
    converted on a best-effort basis: a failure is logged as a warning and
    ``None`` is returned. Values without ``convert`` also yield ``None``.
    """
    candidate = image_value
    if isinstance(candidate, (list, tuple)):
        if not candidate:
            return None
        candidate = candidate[0]

    if isinstance(candidate, Image.Image):
        return candidate.convert("RGB")

    if not hasattr(candidate, "convert"):
        return None

    try:
        return candidate.convert("RGB")
    except Exception as err:
        # Best effort for duck-typed image objects: report and carry on.
        eval_logger.warning("Failed to convert ARC-AGI image to RGB: {}", err)
        return None
28+
29+
30+
def _extract_first_value(value: Any, default: Any = None) -> Any:
    """Unwrap a possibly list-wrapped field to a single value.

    Args:
        value: A scalar, or a list/tuple whose first element is wanted.
        default: Returned when *value* is an empty sequence, or when the
            resulting value is ``None``.

    Returns:
        The first element of a list/tuple, *value* itself for any other type,
        or *default* as described above.
    """
    unwrapped = value
    if isinstance(unwrapped, (list, tuple)):
        if not unwrapped:
            return default
        unwrapped = unwrapped[0]
    return default if unwrapped is None else unwrapped
38+
39+
40+
def _is_grid_of_ints(value):
    """Return ``True`` when *value* is a non-empty rectangular grid of ints.

    A valid grid is a ``list`` containing at least one row, where every row is
    a non-empty ``list`` of the same length and every cell is an ``int``.
    ``bool`` cells are rejected even though ``bool`` subclasses ``int``, so a
    JSON ``true``/``false`` never counts as a colour value.

    Empty grids (``[]``), empty rows (``[[]]``) and ragged rows are rejected:
    an ARC grid is always at least 1x1 and rectangular. Accepting them
    vacuously would let an absent solution that defaulted to ``[]`` compare
    equal to a model answer of ``[]``.
    """
    if not isinstance(value, list) or not value:
        return False

    expected_width = None
    for row in value:
        if not isinstance(row, list) or not row:
            return False
        if expected_width is None:
            expected_width = len(row)
        elif len(row) != expected_width:
            return False
        for cell in row:
            # bool must be tested first because isinstance(True, int) is True.
            if isinstance(cell, bool) or not isinstance(cell, int):
                return False
    return True
50+
51+
52+
def _try_parse_grid(candidate):
    """Parse *candidate* as JSON and return it if it is a grid of ints.

    Args:
        candidate: Text that may hold a JSON array of arrays of integers.

    Returns:
        The parsed grid when *candidate* is a string that decodes to a value
        accepted by ``_is_grid_of_ints``; otherwise ``None``. Non-string
        input, blank strings and undecodable text all yield ``None``.
    """
    if not isinstance(candidate, str):
        return None

    candidate = candidate.strip()
    if not candidate:
        return None

    try:
        parsed = json.loads(candidate)
    except (json.JSONDecodeError, RecursionError):
        # json.loads raises RecursionError, not JSONDecodeError, on deeply
        # nested input such as a degenerate generation made of repeated "[".
        # Treat that as an unparseable response instead of letting a single
        # sample abort the whole evaluation run.
        return None

    return parsed if _is_grid_of_ints(parsed) else None
68+
69+
70+
def _get_raw_solution_grid(doc):
    """Return the reference grid stored under ``doc["raw_solution"]``.

    The field is unwrapped with ``_extract_first_value`` (falling back to an
    empty list) and validated with ``_is_grid_of_ints``. When validation
    fails, a warning naming the document id is logged and ``None`` is
    returned.
    """
    solution = _extract_first_value(doc.get("raw_solution"), default=[])
    if not _is_grid_of_ints(solution):
        eval_logger.warning("Invalid ARC-AGI raw_solution format for id={}", doc.get("id", "unknown"))
        return None
    return solution
77+
78+
79+
def arc_agi_1_doc_to_visual(doc):
    """Build the two-image visual input for an ARC-AGI-1 document.

    Returns:
        ``[stacked_train_image, test_input_image]``, each obtained by passing
        the ``stacked_train_image`` / ``test_images`` field through
        ``_extract_first_image``.

    Raises:
        ValueError: If either image could not be extracted. A warning is
            logged for each missing image before raising.
    """
    demonstrations = _extract_first_image(doc.get("stacked_train_image"))
    test_input = _extract_first_image(doc.get("test_images"))

    incomplete = False
    if demonstrations is None:
        incomplete = True
        eval_logger.warning("Missing stacked_train_image for id={}", doc.get("id", "unknown"))
    if test_input is None:
        incomplete = True
        eval_logger.warning("Missing test_images for id={}", doc.get("id", "unknown"))

    if incomplete:
        raise ValueError(f"ARC-AGI-1 visual inputs are incomplete for id={doc.get('id', 'unknown')}")

    return [demonstrations, test_input]
92+
93+
94+
def arc_agi_1_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Compose the text prompt for an ARC-AGI-1 document.

    Args:
        doc: Dataset record; its ``test_inputs`` field (unwrapped with
            ``_extract_first_value``, defaulting to an empty string) is
            embedded in the prompt as the text form of the test grid.
        lmms_eval_specific_kwargs: Optional mapping that may supply
            ``pre_prompt`` and ``post_prompt`` strings; both default to "".

    Returns:
        The pre-prompt, the fixed task instructions and the post-prompt
        concatenated in that order.
    """
    options = {} if lmms_eval_specific_kwargs is None else lmms_eval_specific_kwargs
    pre_prompt = options.get("pre_prompt", "")
    post_prompt = options.get("post_prompt", "")

    grid_text = _extract_first_value(doc.get("test_inputs"), default="")

    instructions = "".join(
        (
            "You are given demonstration input-output pairs as images. ",
            "Based on the pattern, predict the output grid for the test input.\n",
            "The first image contains all demonstration input/output pairs.\n",
            "The second image contains the test input grid.\n\n",
            f"Text representation of the test input grid:\n{grid_text}\n\n",
            "Output only the predicted output grid as a JSON array of arrays of integers.\n",
            "Example: [[1,2],[3,4]]",
        )
    )

    return f"{pre_prompt}{instructions}{post_prompt}"
114+
115+
116+
def arc_agi_1_doc_to_target(doc):
    """Return the reference answer for *doc* serialised as a JSON string.

    The ``raw_solution`` field is unwrapped with ``_extract_first_value``;
    when it is absent or empty, the JSON text of an empty list is returned.
    """
    return json.dumps(_extract_first_value(doc.get("raw_solution"), default=[]))
119+
120+
121+
def _parse_grid_from_response(text: str) -> list[list[int]] | None:
122+
if not isinstance(text, str):
123+
return None
124+
125+
text = text.strip()
126+
if not text:
127+
return None
128+
129+
parsed_grid = _try_parse_grid(text)
130+
if parsed_grid is not None:
131+
return parsed_grid
132+
133+
for block in CODE_FENCE_PATTERN.findall(text):
134+
parsed_grid = _try_parse_grid(block)
135+
if parsed_grid is not None:
136+
return parsed_grid
137+
138+
for match in GRID_PATTERN.findall(text):
139+
parsed_grid = _try_parse_grid(match)
140+
if parsed_grid is not None:
141+
return parsed_grid
142+
143+
return None
144+
145+
146+
def arc_agi_1_process_results(doc, results):
    """Score one ARC-AGI-1 response by exact grid match.

    Args:
        doc: Dataset record holding the reference solution.
        results: Model outputs; only the first entry is scored. An empty
            sequence is treated as an empty response, and a non-string entry
            is converted with ``str``.

    Returns:
        ``{"arc_agi_1_acc": 1.0}`` when the grid parsed from the response
        equals the reference grid, otherwise ``{"arc_agi_1_acc": 0.0}``. This
        includes the cases where the response cannot be parsed or the
        reference grid is invalid.
    """
    raw_response = results[0] if results else ""
    response_text = raw_response if isinstance(raw_response, str) else str(raw_response)

    predicted = _parse_grid_from_response(response_text)
    # Looked up before the parse check so an invalid reference is still
    # reported even when the response itself is unusable.
    expected = _get_raw_solution_grid(doc)

    if predicted is None:
        eval_logger.debug("Failed to parse model output as ARC grid for id={}", doc.get("id", "unknown"))
        score = 0.0
    elif expected is None:
        score = 0.0
    else:
        score = 1.0 if predicted == expected else 0.0

    return {"arc_agi_1_acc": score}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# ARC-AGI-2 (Image)
2+
3+
ARC-AGI-2 visual reasoning benchmark using a community-rendered image dataset.
4+
5+
- Original: https://github.com/arcprize/ARC-AGI-2
6+
- Dataset: https://huggingface.co/datasets/vincentkoc/arc-agi-2
7+
- Note: Uses a community image conversion (not the official ARC Prize packaging).
8+
9+
## Usage
10+
```bash
11+
python -m lmms_eval --tasks arc_agi_2 --model <model_name> --batch_size 1
12+
```
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
dataset_path: vincentkoc/arc-agi-2
2+
dataset_kwargs:
3+
token: false
4+
output_type: generate_until
5+
doc_to_visual: !function utils.arc_agi_2_doc_to_visual
6+
doc_to_text: !function utils.arc_agi_2_doc_to_text
7+
doc_to_target: !function utils.arc_agi_2_doc_to_target
8+
generation_kwargs:
9+
max_new_tokens: 1024
10+
temperature: 0
11+
top_p: 1.0
12+
do_sample: false
13+
process_results: !function utils.arc_agi_2_process_results
14+
metric_list:
15+
- metric: arc_agi_2_acc
16+
aggregation: mean
17+
higher_is_better: true
18+
lmms_eval_specific_kwargs:
19+
default:
20+
pre_prompt: ""
21+
post_prompt: ""
22+
metadata:
23+
version: 0.0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
include: _default_template_yaml
2+
task: arc_agi_2
3+
test_split: evaluation

0 commit comments

Comments
 (0)