Skip to content

Commit 05ed5a7

Browse files
authored
feat: integrate six traceable benchmarks with unified smoke test (#1202)
* refactor: remove dead read_video_pyav_pil and deduplicate _resize_image in load_video
* refactor: rename read_video_pyav -> read_video, remove dead code
  - Rename read_video_pyav to read_video in load_video.py with backward-compat alias
  - Delete _resize_image and read_video_pyav_base64 dead functions
  - Update all 12 caller files to use read_video directly
  - Inline base64 encoding logic in qwen2_5_omni.py (was read_video_pyav_base64)
  - Fix missing import in vila.py (latent bug)
  - Remove use_custom_video_loader dead code from 5 models that declared but never checked it (qwen2_5_vl, qwen3_vl, qwen3_omni, llava_onevision1_5, huggingface)
* docs: rewrite Section 7.1 to document read_video backends, remove dead Section 7.2
* feat: unified CLI with subcommand dispatch and interactive wizard
  Add lmms_eval/cli/ package with subcommand-based architecture:
    eval - run evaluation (wizard mode when no args)
    tasks - list/groups/subtasks/tags browser
    models - list backends with optional --aliases
    ui - launch Web UI
    serve - start HTTP eval server
    power - statistical power analysis
    version - version and environment info
    tui - terminal UI (textual)
  Full backward compat: lmms-eval --model X --tasks Y still works.
  Entrypoint rewired through cli.dispatch:main in pyproject.toml.
* docs: add external usage guide for CLI and library access
  Add docs/external_usage.md covering CLI subcommands (tasks, models, eval wizard, ui, serve, power, version) and Python library usage (TaskManager, datasets, evaluator, metrics).
  Update docs index link. Polish v0.7 release notes for consistency.
* feat(tasks): add six benchmark tasks and unified smoke report
* fix(smoke): enable audio payloads for openrouter omni runs
* fix(smoke): use 1fps video sampling for api smoke runs
* fix(multimodal): correct audio routing and video fps sampling
* test(cli): add dispatch and task pipeline coverage
1 parent 94da674 commit 05ed5a7

20 files changed

Lines changed: 672 additions & 14 deletions

File tree

lmms_eval/api/task.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1692,6 +1692,7 @@ def auto_doc_to_messages(doc):
16921692
messages = [{"role": "user", "content": []}]
16931693
content = []
16941694
_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".webp"}
1695+
_AUDIO_EXTS = {".wav", ".mp3", ".m4a", ".aac", ".flac", ".ogg", ".opus", ".webm"}
16951696
for visual in visuals:
16961697
if isinstance(visual, PIL_Image.Image):
16971698
content.append({"type": "image", "url": visual})
@@ -1701,6 +1702,8 @@ def auto_doc_to_messages(doc):
17011702
ext = os.path.splitext(visual)[1].lower()
17021703
if ext in _IMAGE_EXTS:
17031704
content.append({"type": "image", "url": visual})
1705+
elif ext in _AUDIO_EXTS:
1706+
content.append({"type": "audio", "url": visual})
17041707
else:
17051708
content.append({"type": "video", "url": visual})
17061709
content.append({"type": "text", "text": text})

lmms_eval/models/chat/openai.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,13 @@ def build_payload_for_index(global_index: int) -> dict:
180180
max_new_tokens = min(request_gen_kwargs.get("max_new_tokens", 1024), 4096)
181181
temperature = request_gen_kwargs.get("temperature", 0)
182182

183+
if self.video_fps is not None and self.video_fps > 0:
184+
video_kwargs = {"fps": self.video_fps}
185+
else:
186+
video_kwargs = {"nframes": self.max_frames_num}
187+
183188
payload = {
184-
"messages": chat_messages.to_openai_messages(video_kwargs={"nframes": self.max_frames_num}),
189+
"messages": chat_messages.to_openai_messages(video_kwargs=video_kwargs),
185190
"model": self.model_version,
186191
"max_tokens": max_new_tokens,
187192
"temperature": temperature,

lmms_eval/models/simple/openai.py

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import base64
12
import os
23
import time
34
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
@@ -53,6 +54,7 @@ def __init__(
5354
max_size_in_mb: int = 20,
5455
azure_openai: bool = False,
5556
max_frames_num: int = 10,
57+
video_fps: Optional[float] = None,
5658
httpx_trust_env: bool = True,
5759
batch_size: int = 64,
5860
num_concurrent: int = 32,
@@ -81,6 +83,7 @@ def __init__(
8183
self.max_retries = max_retries
8284
self.max_size_in_mb = max_size_in_mb # some models have a limit on the size of the image
8385
self.max_frames_num = max_frames_num
86+
self.video_fps = float(video_fps) if video_fps is not None else None
8487
self.num_concurrent = max(1, int(num_concurrent))
8588
self.adaptive_concurrency = parse_bool(adaptive_concurrency)
8689
self.adaptive_config = AdaptiveConcurrencyConfig.from_raw(
@@ -200,13 +203,32 @@ def encode_image(self, image: Union[Image.Image, str]):
200203
def encode_video(self, video_path, for_get_frames_num):
201204
vr = VideoReader(video_path, ctx=cpu(0))
202205
total_frame_num = len(vr)
203-
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int)
204-
205-
# Ensure the last frame is included
206-
if total_frame_num - 1 not in uniform_sampled_frames:
207-
uniform_sampled_frames = np.append(uniform_sampled_frames, total_frame_num - 1)
206+
if total_frame_num <= 0:
207+
return []
208208

209-
frame_idx = uniform_sampled_frames.tolist()
209+
frame_idx = []
210+
211+
if self.video_fps is not None and self.video_fps > 0:
212+
source_fps = float(vr.get_avg_fps()) if hasattr(vr, "get_avg_fps") else 0.0
213+
if source_fps > 0:
214+
step = max(1, int(round(source_fps / self.video_fps)))
215+
frame_idx = list(range(0, total_frame_num, step))
216+
if frame_idx and frame_idx[-1] != total_frame_num - 1:
217+
frame_idx.append(total_frame_num - 1)
218+
219+
if not frame_idx:
220+
sample_count = min(max(1, int(for_get_frames_num)), total_frame_num)
221+
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_count, dtype=int)
222+
if total_frame_num - 1 not in uniform_sampled_frames:
223+
uniform_sampled_frames = np.append(uniform_sampled_frames, total_frame_num - 1)
224+
frame_idx = uniform_sampled_frames.tolist()
225+
elif for_get_frames_num and len(frame_idx) > int(for_get_frames_num):
226+
keep = np.linspace(0, len(frame_idx) - 1, int(for_get_frames_num), dtype=int)
227+
frame_idx = [frame_idx[i] for i in keep]
228+
if frame_idx[-1] != total_frame_num - 1:
229+
frame_idx.append(total_frame_num - 1)
230+
231+
frame_idx = sorted(set(frame_idx))
210232
frames = vr.get_batch(frame_idx).asnumpy()
211233

212234
base64_frames = []
@@ -223,6 +245,13 @@ def encode_video(self, video_path, for_get_frames_num):
223245

224246
return base64_frames
225247

248+
def encode_audio_file(self, audio_path: str):
    """Read an audio file and return ``(base64_text, format_name)``.

    The format name is the lower-cased file extension without its dot when it
    is one of wav/mp3/flac/aac/ogg/m4a; any other extension is reported as
    ``"wav"``. The file content is base64-encoded and decoded to a UTF-8 string.
    """
    known_formats = {"wav", "mp3", "flac", "aac", "ogg", "m4a"}
    _, suffix = os.path.splitext(audio_path)
    suffix = suffix.lower().lstrip(".")
    if suffix not in known_formats:
        suffix = "wav"

    with open(audio_path, "rb") as stream:
        raw_bytes = stream.read()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return encoded, suffix
254+
226255
def flatten(self, input):
227256
new_list = []
228257
for i in input:
@@ -370,9 +399,12 @@ def build_payload_for_index(global_index: int):
370399
visuals = self.flatten(visuals)
371400
imgs = []
372401
for visual in visuals:
373-
if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual):
402+
if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual or ".webm" in visual or ".mkv" in visual):
374403
frames = self.encode_video(visual, self.max_frames_num)
375404
imgs.extend(frames)
405+
elif isinstance(visual, str) and (".wav" in visual or ".mp3" in visual or ".flac" in visual or ".aac" in visual or ".ogg" in visual or ".m4a" in visual):
406+
audio_b64, audio_format = self.encode_audio_file(visual)
407+
imgs.append({"audio_b64": audio_b64, "audio_format": audio_format})
376408
elif isinstance(visual, str) and (".jpg" in visual or ".jpeg" in visual or ".png" in visual or ".gif" in visual or ".bmp" in visual or ".tiff" in visual or ".webp" in visual):
377409
imgs.append(self.encode_image(visual))
378410
elif isinstance(visual, Image.Image):
@@ -390,12 +422,20 @@ def build_payload_for_index(global_index: int):
390422
}
391423
payload["messages"][0]["content"].append({"type": "text", "text": context})
392424
for img in imgs:
393-
payload["messages"][0]["content"].append(
394-
{
395-
"type": "image_url",
396-
"image_url": {"url": f"data:image/png;base64,{img}"},
397-
}
398-
)
425+
if isinstance(img, dict) and "audio_b64" in img:
426+
payload["messages"][0]["content"].append(
427+
{
428+
"type": "input_audio",
429+
"input_audio": {"data": img["audio_b64"], "format": img["audio_format"]},
430+
}
431+
)
432+
else:
433+
payload["messages"][0]["content"].append(
434+
{
435+
"type": "image_url",
436+
"image_url": {"url": f"data:image/png;base64,{img}"},
437+
}
438+
)
399439

400440
if "o1" in self.model_version or "o3" in self.model_version:
401441
payload.pop("temperature")

lmms_eval/tasks/av_asr/av_asr.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
dataset_path: json
2+
dataset_kwargs:
3+
data_files:
4+
test: data/av_asr_test.json
5+
task: av_asr
6+
test_split: test
7+
output_type: generate_until
8+
doc_to_visual: !function utils.av_asr_doc_to_visual
9+
doc_to_text: !function utils.av_asr_doc_to_text
10+
doc_to_target: !function utils.av_asr_doc_to_target
11+
generation_kwargs:
12+
max_new_tokens: 256
13+
temperature: 0
14+
do_sample: false
15+
process_results: !function utils.av_asr_process_results
16+
metric_list:
17+
- metric: wer
18+
aggregation: !function utils.av_asr_wer
19+
higher_is_better: false
20+
lmms_eval_specific_kwargs:
21+
default:
22+
pre_prompt: ""
23+
post_prompt: ""
24+
metadata:
25+
- version: 0.0

lmms_eval/tasks/av_asr/utils.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import re
2+
3+
4+
def _normalize(text):
5+
lowered = str(text or "").strip().lower()
6+
lowered = re.sub(r"[^a-z0-9\s']", " ", lowered)
7+
lowered = re.sub(r"\s+", " ", lowered)
8+
return lowered.strip()
9+
10+
11+
def _word_error_rate(reference, hypothesis):
    """Return the word-level edit distance divided by the reference length.

    Both strings are normalized and split on whitespace first. With an empty
    reference the result is 0.0 when the hypothesis is also empty and 1.0
    otherwise.
    """
    ref_tokens = _normalize(reference).split()
    hyp_tokens = _normalize(hypothesis).split()
    if len(ref_tokens) == 0:
        return 1.0 if hyp_tokens else 0.0

    # previous[j] is the edit distance between the reference prefix handled so
    # far and hyp_tokens[:j]; only the last completed row is ever needed.
    previous = list(range(len(hyp_tokens) + 1))
    for row, ref_token in enumerate(ref_tokens, start=1):
        current = [row]
        for col, hyp_token in enumerate(hyp_tokens, start=1):
            substitution = previous[col - 1] + (0 if ref_token == hyp_token else 1)
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            current.append(min(deletion, insertion, substitution))
        previous = current

    return previous[-1] / len(ref_tokens)
36+
37+
38+
def av_asr_doc_to_visual(doc):
    """Collect the media entries of one document: audio first, then video.

    At most one value is taken from the audio keys (``audio``, ``audio_path``)
    and at most one from the video keys (``video``, ``video_path``, ``file``,
    ``path``); within each group the first truthy value wins.
    """

    def first_truthy(names):
        # map() is lazy, so the lookup stops at the first truthy value.
        return next((value for value in map(doc.get, names) if value), None)

    picks = (
        first_truthy(("audio", "audio_path")),
        first_truthy(("video", "video_path", "file", "path")),
    )
    return [pick for pick in picks if pick]
51+
52+
53+
def av_asr_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the text prompt for one AV-ASR document.

    Args:
        doc: Mapping for one dataset row. Its optional ``"question"`` entry
            replaces the default transcription instruction.
        lmms_eval_specific_kwargs: Optional mapping that may carry
            ``"pre_prompt"`` and ``"post_prompt"`` strings.

    Returns:
        ``pre_prompt + question + post_prompt`` as a single string.
    """
    kwargs = lmms_eval_specific_kwargs or {}
    # ``or ""`` also covers an explicit null, which would otherwise be
    # formatted as the literal text "None".
    pre_prompt = kwargs.get("pre_prompt") or ""
    post_prompt = kwargs.get("post_prompt") or ""

    # A row can contain the key with a null or blank value; fall back to the
    # default instruction instead of prompting with "None" or nothing at all.
    raw_question = doc.get("question")
    question = "" if raw_question is None else str(raw_question).strip()
    if not question:
        question = "Transcribe the speech in this video."
    return f"{pre_prompt}{question}{post_prompt}"
59+
60+
61+
def av_asr_doc_to_target(doc):
    """Return the reference transcript of a document as a string.

    The keys ``text``, ``transcript``, ``gt`` and ``answer`` are consulted in
    that order; the first value that is not ``None`` is stringified. An empty
    string is returned when none of them is set.
    """
    candidates = (doc.get(field) for field in ("text", "transcript", "gt", "answer"))
    found = next((value for value in candidates if value is not None), None)
    if found is None:
        return ""
    return str(found)
67+
68+
69+
def av_asr_process_results(doc, results):
    """Pair the first model output with the reference transcript.

    Returns a dict with a single ``"wer"`` entry holding ``{"gt", "pred"}``;
    the prediction is an empty string when ``results`` is empty.
    """
    if results:
        hypothesis = results[0]
    else:
        hypothesis = ""
    reference = av_asr_doc_to_target(doc)
    return {"wer": {"gt": reference, "pred": hypothesis}}
73+
74+
75+
def av_asr_wer(items):
    """Average the per-item word error rate and express it as a percentage.

    Each item is a mapping with ``"gt"`` and ``"pred"`` entries (missing ones
    count as empty strings). An empty ``items`` yields 0.0.
    """
    if items:
        error_sum = 0.0
        for record in items:
            error_sum = error_sum + _word_error_rate(record.get("gt", ""), record.get("pred", ""))
        return 100.0 * error_sum / len(items)
    return 0.0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
group: anet_qa
2+
task:
3+
- activitynetqa
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
group: egosch_a
2+
task:
3+
- egoschema
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
group: mmmu_a
2+
task:
3+
- mmmu_val
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
dataset_path: json
2+
dataset_kwargs:
3+
data_files:
4+
test: data/countix_test.json
5+
task: countix
6+
test_split: test
7+
output_type: generate_until
8+
doc_to_visual: !function utils.countix_doc_to_visual
9+
doc_to_text: !function utils.countix_doc_to_text
10+
doc_to_target: !function utils.countix_doc_to_target
11+
generation_kwargs:
12+
max_new_tokens: 16
13+
temperature: 0
14+
do_sample: false
15+
process_results: !function utils.countix_process_results
16+
metric_list:
17+
- metric: mae_norm
18+
aggregation: mean
19+
higher_is_better: false
20+
- metric: obo
21+
aggregation: mean
22+
higher_is_better: true
23+
lmms_eval_specific_kwargs:
24+
default:
25+
pre_prompt: ""
26+
post_prompt: "\nAnswer with a single integer."
27+
metadata:
28+
- version: 0.0

lmms_eval/tasks/countix/utils.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import re
2+
3+
4+
def _extract_count(value):
5+
if value is None:
6+
return None
7+
text = str(value).strip().lower().replace(",", "")
8+
match = re.search(r"-?\d+(?:\.\d+)?", text)
9+
if not match:
10+
return None
11+
return int(round(float(match.group(0))))
12+
13+
14+
def _get_target_count(doc):
    """Return the ground-truth count of a document, or ``None`` if absent.

    The keys ``count``, ``answer``, ``number``, ``gt_count`` and ``label`` are
    tried in that order; the first one that parses to a number wins.
    """
    # The generator is lazy, so parsing stops at the first key that yields a count.
    parsed = (_extract_count(doc.get(field)) for field in ("count", "answer", "number", "gt_count", "label"))
    return next((count for count in parsed if count is not None), None)
20+
21+
22+
def countix_doc_to_visual(doc):
    """Return the document's media entry as a one-element list.

    The keys ``video``, ``video_path``, ``image``, ``img``, ``file`` and
    ``path`` are checked in that order and the first truthy value is used; an
    empty list is returned when none is set.
    """
    media_keys = ("video", "video_path", "image", "img", "file", "path")
    found = next((value for value in map(doc.get, media_keys) if value), None)
    return [found] if found else []
28+
29+
30+
def countix_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the text prompt for one Countix document.

    Args:
        doc: Mapping for one dataset row. Its optional ``"question"`` entry
            replaces the default counting instruction.
        lmms_eval_specific_kwargs: Optional mapping that may carry
            ``"pre_prompt"`` and ``"post_prompt"`` strings.

    Returns:
        ``pre_prompt + question + post_prompt`` as a single string.
    """
    kwargs = lmms_eval_specific_kwargs or {}
    # ``or ""`` also covers an explicit null, which would otherwise be
    # formatted as the literal text "None".
    pre_prompt = kwargs.get("pre_prompt") or ""
    post_prompt = kwargs.get("post_prompt") or ""

    # A row can contain the key with a null or blank value; fall back to the
    # default instruction instead of prompting with "None" or nothing at all.
    raw_question = doc.get("question")
    question = "" if raw_question is None else str(raw_question).strip()
    if not question:
        question = "Count the number of repetitions in this clip."
    return f"{pre_prompt}{question}{post_prompt}"
36+
37+
38+
def countix_doc_to_target(doc):
    """Return the ground-truth count as a string, or ``""`` when it is missing."""
    count = _get_target_count(doc)
    if count is None:
        return ""
    return str(count)
41+
42+
43+
def countix_process_results(doc, results):
    """Score one Countix prediction.

    Args:
        doc: Mapping for one dataset row; the ground-truth count is read from it.
        results: Model outputs for the row; only the first one is scored.

    Returns:
        A dict with two floats:
        ``mae_norm`` - absolute count error divided by ``|target| + 0.1``;
        ``obo`` - 1.0 when the prediction is within one of the target, else 0.0.
        Without a usable target both are 0.0. With an unparseable prediction
        ``obo`` is 0.0 and ``mae_norm`` is the error of predicting zero.
    """
    prediction = results[0] if results else ""
    pred_count = _extract_count(prediction)
    target_count = _get_target_count(doc)

    if target_count is None:
        return {"mae_norm": 0.0, "obo": 0.0}

    # The 0.1 offset keeps a zero target from dividing by zero; abs() keeps the
    # denominator positive even if a dataset row carries a negative number.
    denominator = abs(target_count) + 0.1

    if pred_count is None:
        # Keep the penalty on the same relative scale as every other sample;
        # returning the raw absolute error here would let a single unparseable
        # answer dominate the mean of normalized errors.
        return {"mae_norm": float(abs(target_count) / denominator), "obo": 0.0}

    error = abs(pred_count - target_count)
    return {"mae_norm": float(error / denominator), "obo": float(error <= 1)}

0 commit comments

Comments
 (0)