Skip to content

Commit 79ddef8

Browse files
authored
feat: integrate OfficeQA benchmark task (#1150)
1 parent bb846e1 commit 79ddef8

3 files changed

Lines changed: 184 additions & 0 deletions

File tree

docs/current_tasks.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ python -m lmms_eval --tasks list_with_num
403403
- [Infographic VQA](https://www.docvqa.org/datasets/infographicvqa) (infovqa)
404404
- Infographic VQA Validation (infovqa_val)
405405
- Infographic VQA Test (infovqa_test)
406+
- [OfficeQA](https://github.com/databricks/officeqa) (officeqa)
406407
- [OCRBench](https://github.com/Yuliang-Liu/MultimodalOCR) (ocrbench)
407408
- [OCRBench v2](https://github.com/Yuliang-Liu/MultimodalOCR) (ocrbench_v2)
408409
- [OmniDocBench](https://github.com/opendatalab/OmniDocBench) (omnidocbench)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# lmms-eval task config for OfficeQA: rows are read from the CSV file at data_files.
dataset_path: csv
2+
dataset_kwargs:
3+
data_files: https://raw.githubusercontent.com/databricks/officeqa/main/officeqa.csv
4+
task: officeqa
5+
# The `train` split is evaluated (presumably the only split the csv loader exposes — TODO confirm).
test_split: train
6+
output_type: generate_until
7+
doc_to_visual: !function utils.officeqa_doc_to_visual
8+
doc_to_text: !function utils.officeqa_doc_to_text
9+
doc_to_target: !function utils.officeqa_doc_to_target
10+
# Deterministic decoding: no sampling, temperature 0, at most 64 new tokens.
generation_kwargs:
11+
max_new_tokens: 64
12+
temperature: 0
13+
do_sample: false
14+
process_results: !function utils.officeqa_process_results
15+
# Per-document scores under this key are averaged; higher is better.
metric_list:
16+
- metric: officeqa_relaxed_accuracy
17+
aggregation: mean
18+
higher_is_better: true
19+
# Prompt prefix/suffix passed to officeqa_doc_to_text; `qwen_vl` looks like a
# model-specific override of `default` — TODO confirm how the entry is selected.
lmms_eval_specific_kwargs:
20+
default:
21+
pre_prompt: ""
22+
post_prompt: "\nAnswer with a single word, phrase, or option letter."
23+
qwen_vl:
24+
pre_prompt: ""
25+
post_prompt: " Answer:"
26+
metadata:
27+
- version: 0.0

lmms_eval/tasks/officeqa/utils.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
import io
2+
import re
3+
from typing import Any
4+
5+
from PIL import Image
6+
7+
8+
def _normalize_text(text: Any) -> str:
9+
if text is None:
10+
return ""
11+
return " ".join(str(text).strip().lower().split())
12+
13+
14+
def _to_float(text: str):
15+
candidate = text.strip().replace(",", "")
16+
if candidate.endswith("%"):
17+
try:
18+
return float(candidate[:-1]) / 100.0
19+
except ValueError:
20+
return None
21+
22+
match = re.match(r"^[-+]?\d*\.?\d+$", candidate)
23+
if not match:
24+
return None
25+
26+
try:
27+
return float(candidate)
28+
except ValueError:
29+
return None
30+
31+
32+
def _relaxed_match(prediction: str, target: str, max_relative_change: float = 0.05) -> float:
    """Score *prediction* against *target* as ``1.0`` (match) or ``0.0``.

    Both strings are normalised first; an empty side never matches. When
    both sides parse as numbers they match if the relative error with
    respect to the target is at most *max_relative_change* (a zero target
    requires a zero prediction). Otherwise the normalised strings must be
    equal.
    """
    normalized_pred = _normalize_text(prediction)
    normalized_tgt = _normalize_text(target)
    if not (normalized_pred and normalized_tgt):
        return 0.0

    pred_value = _to_float(normalized_pred)
    tgt_value = _to_float(normalized_tgt)

    # Unless both sides are numeric, fall back to exact string comparison.
    if pred_value is None or tgt_value is None:
        return 1.0 if normalized_pred == normalized_tgt else 0.0

    # Relative error is undefined for a zero target; require exact zero.
    if tgt_value == 0:
        return 1.0 if pred_value == 0 else 0.0

    relative_error = abs(pred_value - tgt_value) / abs(tgt_value)
    return 1.0 if relative_error <= max_relative_change else 0.0
47+
48+
49+
def _to_rgb(image_obj: Any):
    """Return *image_obj* as an RGB PIL image, or ``None`` if unsupported.

    Accepts a ``PIL.Image.Image`` or raw encoded image ``bytes``; every
    other input type yields ``None``.
    """
    if isinstance(image_obj, bytes):
        # Decode the raw bytes into an image, then convert it like any other.
        image_obj = Image.open(io.BytesIO(image_obj))
    elif not isinstance(image_obj, Image.Image):
        return None
    return image_obj.convert("RGB")
57+
58+
59+
def _extract_question(doc: dict) -> str:
60+
for key in ["question", "query", "prompt", "instruction"]:
61+
value = doc.get(key)
62+
if isinstance(value, str) and value.strip():
63+
return value.strip()
64+
return ""
65+
66+
67+
def _extract_options(doc: dict) -> list[str]:
68+
options = doc.get("options", doc.get("choices"))
69+
if isinstance(options, list):
70+
normalized = []
71+
for item in options:
72+
if isinstance(item, dict):
73+
text = item.get("text", item.get("option", ""))
74+
normalized.append(str(text))
75+
else:
76+
normalized.append(str(item))
77+
return [x for x in normalized if x.strip()]
78+
return []
79+
80+
81+
def _extract_answers(doc: dict) -> list[str]:
82+
answers = doc.get("answers", doc.get("answer", doc.get("target")))
83+
if answers is None:
84+
return []
85+
if isinstance(answers, list):
86+
return [str(item) for item in answers if str(item).strip()]
87+
return [str(answers)]
88+
89+
90+
def _extract_option_letter(prediction: str) -> str:
91+
normalized = prediction.strip().upper()
92+
match = re.search(r"\b([A-Z])\b", normalized)
93+
if match:
94+
return match.group(1)
95+
return ""
96+
97+
98+
def officeqa_doc_to_visual(doc):
    """Collect every usable image from *doc* as a list of RGB images.

    Single-image fields (``image``, ``page_image``, ``document_image``)
    are read first, followed by the items of list-valued fields
    (``images``, ``page_images``, ``document_images``, ``pages``). Entries
    that ``_to_rgb`` cannot convert are dropped. The list may be empty.
    """
    candidates = []

    for key in ("image", "page_image", "document_image"):
        if key in doc:
            candidates.append(doc[key])

    for key in ("images", "page_images", "document_images", "pages"):
        pages = doc.get(key)
        if isinstance(pages, list):
            candidates.extend(pages)

    converted = (_to_rgb(candidate) for candidate in candidates)
    return [image for image in converted if image is not None]
116+
117+
118+
def officeqa_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the text prompt for *doc*.

    The prompt is ``pre_prompt + question + post_prompt``. When the
    document has options, they are appended to the question on separate
    lines, lettered ``A.``, ``B.``, ... in order. ``pre_prompt`` and
    ``post_prompt`` come from *lmms_eval_specific_kwargs* and default to
    the empty string.
    """
    prompt_cfg = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
    prefix = prompt_cfg.get("pre_prompt", "")
    suffix = prompt_cfg.get("post_prompt", "")

    body = _extract_question(doc)
    choices = _extract_options(doc)
    if choices:
        lettered = [f"{chr(ord('A') + position)}. {choice}" for position, choice in enumerate(choices)]
        body = body + "\n" + "\n".join(lettered)
    return f"{prefix}{body}{suffix}"
131+
132+
133+
def officeqa_doc_to_target(doc):
    """Return the first gold answer of *doc*, or ``""`` when it has none."""
    answers = _extract_answers(doc)
    if not answers:
        return ""
    return answers[0]
136+
137+
138+
def officeqa_process_results(doc, results):
    """Score one model response for the ``officeqa_relaxed_accuracy`` metric.

    The prediction (first element of *results*) is relaxed-matched against
    every gold answer. For multiple-choice documents a predicted option
    letter is also credited when either

    * the gold answer is itself labelled with that letter (``"B"``,
      ``"(B)"``, ``"B. Paris"``), or
    * the option text behind the predicted letter equals the gold answer.

    Previously the predicted letter was compared with the *first character*
    of the gold answer, so a gold answer "Apple" credited a prediction of
    "A" regardless of what option A actually was. An empty *results* list
    now scores 0 instead of raising ``IndexError``.

    Args:
        doc: Dataset row holding the gold answer(s) and optional options.
        results: Model outputs for this row; only the first is scored.

    Returns:
        ``{"officeqa_relaxed_accuracy": score}`` with score 0.0 or 1.0.
    """
    metric = "officeqa_relaxed_accuracy"

    prediction = str(results[0]).strip() if results else ""
    answers = _extract_answers(doc)
    if not answers:
        return {metric: 0.0}

    best_score = max(_relaxed_match(prediction, answer) for answer in answers)
    if best_score >= 1.0:
        return {metric: best_score}

    options = _extract_options(doc)
    pred_letter = _extract_option_letter(prediction) if options else ""
    if not pred_letter:
        return {metric: best_score}

    # Letters follow the order produced by _extract_options, which is the
    # same list officeqa_doc_to_text labels A, B, C, ... in the prompt.
    index = ord(pred_letter) - ord("A")
    chosen_text = options[index] if index < len(options) else None

    for answer in answers:
        # Gold answer written as an option label: lone letter or "B." / "(B)".
        gold_label = re.match(r"\(?([A-Z])(?:[).:,]|\s*$)", answer.strip().upper())
        if gold_label is not None and gold_label.group(1) == pred_letter:
            best_score = 1.0
            break
        # Gold answer written as option text: compare exactly (zero numeric
        # tolerance) so near-identical numeric options cannot both count.
        if chosen_text is not None and _relaxed_match(chosen_text, answer, max_relative_change=0.0) == 1.0:
            best_score = 1.0
            break

    return {metric: best_score}

0 commit comments

Comments
 (0)