Skip to content

Commit f2b55c3

Browse files
Super User
authored and committed
feat(omnidocbench): add normalized Levenshtein distance metric
Add omnidocbench_nld_score metric computed as (1 - NLD) * 100, following the Kimi K2.5 technical report scoring method. The existing exact_match metric is preserved alongside the new one.
1 parent 11a3601 commit f2b55c3

2 files changed

Lines changed: 29 additions & 4 deletions

File tree

lmms_eval/tasks/omnidocbench/omnidocbench.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,9 @@ metric_list:
2020
- metric: omnidocbench_exact_match
2121
aggregation: mean
2222
higher_is_better: true
23+
- metric: omnidocbench_nld_score
24+
aggregation: mean
25+
higher_is_better: true
2326
lmms_eval_specific_kwargs:
2427
default:
2528
pre_prompt: ""

lmms_eval/tasks/omnidocbench/utils.py

Lines changed: 26 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import re
33
from typing import Any
44

5+
import Levenshtein
56
from PIL import Image
67

78

@@ -96,21 +97,42 @@ def omnidocbench_doc_to_target(doc):
9697
return answers[0] if answers else ""
9798

9899

100+
def _normalized_levenshtein_score(pred: str, ref: str) -> float:
101+
"""Compute (1 - normalized_levenshtein_distance) * 100.
102+
103+
Following the Kimi K2.5 technical report metric.
104+
"""
105+
if not pred and not ref:
106+
return 100.0
107+
max_len = max(len(pred), len(ref))
108+
if max_len == 0:
109+
return 100.0
110+
dist = Levenshtein.distance(pred, ref)
111+
return (1.0 - dist / max_len) * 100.0
112+
113+
99114
def omnidocbench_process_results(doc, results):
    """Score a single response against the reference answers of ``doc``.

    The first element of ``results`` is used as the prediction. Two metrics
    are returned in a dict:

    * ``omnidocbench_exact_match``: 1.0 when the normalized prediction equals
      any normalized answer, or (when the document has options) when the
      option letter extracted from the raw prediction equals the first
      upper-cased character of any answer; otherwise 0.0.
    * ``omnidocbench_nld_score``: the best ``(1 - NLD) * 100`` score between
      the normalized prediction and each normalized answer.

    Both metrics are 0.0 when the document yields no answers.
    """
    normalized_pred = _normalize_text(results[0])
    references = _extract_answers(doc)
    if not references:
        return {"omnidocbench_exact_match": 0.0, "omnidocbench_nld_score": 0.0}

    # Exact match against the set of normalized reference answers.
    normalized_refs = {_normalize_text(ref) for ref in references}
    exact = 1.0 if normalized_pred in normalized_refs else 0.0

    # Multiple-choice fallback: compare the predicted option letter with the
    # leading character of each reference answer.
    if _extract_options(doc):
        letter = _extract_option_letter(str(results[0]))
        if letter:
            leading_chars = [ref.strip().upper()[:1] for ref in references]
            if any(letter == char for char in leading_chars):
                exact = 1.0

    # Normalized Levenshtein score: keep the best value across all answers.
    similarities = [
        _normalized_levenshtein_score(normalized_pred, _normalize_text(ref))
        for ref in references
    ]

    return {
        "omnidocbench_exact_match": exact,
        "omnidocbench_nld_score": max(similarities),
    }

0 commit comments

Comments
 (0)