EvolvingLMMs-Lab
diff --git a/‎lmms_eval/api/task.py‎
Lines changed: 3 additions & 0 deletions b/‎lmms_eval/api/task.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎lmms_eval/models/chat/openai.py‎
Lines changed: 6 additions & 1 deletion b/‎lmms_eval/models/chat/openai.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎lmms_eval/models/simple/openai.py‎
Lines changed: 53 additions & 13 deletions b/‎lmms_eval/models/simple/openai.py‎
Lines changed: 53 additions & 13 deletions
diff --git a/‎lmms_eval/tasks/av_asr/av_asr.yaml‎
Lines changed: 25 additions & 0 deletions b/‎lmms_eval/tasks/av_asr/av_asr.yaml‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎lmms_eval/tasks/av_asr/utils.py‎
Lines changed: 81 additions & 0 deletions b/‎lmms_eval/tasks/av_asr/utils.py‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎lmms_eval/tasks/benchmark_aliases/anet_qa.yaml‎
Lines changed: 3 additions & 0 deletions b/‎lmms_eval/tasks/benchmark_aliases/anet_qa.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎lmms_eval/tasks/benchmark_aliases/egosch_a.yaml‎
Lines changed: 3 additions & 0 deletions b/‎lmms_eval/tasks/benchmark_aliases/egosch_a.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎lmms_eval/tasks/benchmark_aliases/mmmu_a.yaml‎
Lines changed: 3 additions & 0 deletions b/‎lmms_eval/tasks/benchmark_aliases/mmmu_a.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎lmms_eval/tasks/countix/countix.yaml‎
Lines changed: 28 additions & 0 deletions b/‎lmms_eval/tasks/countix/countix.yaml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎lmms_eval/tasks/countix/utils.py‎
Lines changed: 53 additions & 0 deletions b/‎lmms_eval/tasks/countix/utils.py‎
Lines changed: 53 additions & 0 deletions
@@ -1692,6 +1692,7 @@ def auto_doc_to_messages(doc):
                 messages = [{"role": "user", "content": []}]
                 content = []
                 _IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".webp"}
+                _AUDIO_EXTS = {".wav", ".mp3", ".m4a", ".aac", ".flac", ".ogg", ".opus", ".webm"}
                 for visual in visuals:
                     if isinstance(visual, PIL_Image.Image):
                         content.append({"type": "image", "url": visual})
@@ -1701,6 +1702,8 @@ def auto_doc_to_messages(doc):
                         ext = os.path.splitext(visual)[1].lower()
                         if ext in _IMAGE_EXTS:
                             content.append({"type": "image", "url": visual})
+                        elif ext in _AUDIO_EXTS:
+                            content.append({"type": "audio", "url": visual})
                         else:
                             content.append({"type": "video", "url": visual})
                 content.append({"type": "text", "text": text})
 
@@ -180,8 +180,13 @@ def build_payload_for_index(global_index: int) -> dict:
             max_new_tokens = min(request_gen_kwargs.get("max_new_tokens", 1024), 4096)
             temperature = request_gen_kwargs.get("temperature", 0)
 
+            if self.video_fps is not None and self.video_fps > 0:
+                video_kwargs = {"fps": self.video_fps}
+            else:
+                video_kwargs = {"nframes": self.max_frames_num}
+
             payload = {
-                "messages": chat_messages.to_openai_messages(video_kwargs={"nframes": self.max_frames_num}),
+                "messages": chat_messages.to_openai_messages(video_kwargs=video_kwargs),
                 "model": self.model_version,
                 "max_tokens": max_new_tokens,
                 "temperature": temperature,
 
@@ -1,3 +1,4 @@
+import base64
 import os
 import time
 from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
@@ -53,6 +54,7 @@ def __init__(
         max_size_in_mb: int = 20,
         azure_openai: bool = False,
         max_frames_num: int = 10,
+        video_fps: Optional[float] = None,
         httpx_trust_env: bool = True,
         batch_size: int = 64,
         num_concurrent: int = 32,
@@ -81,6 +83,7 @@ def __init__(
         self.max_retries = max_retries
         self.max_size_in_mb = max_size_in_mb  # some models have a limit on the size of the image
         self.max_frames_num = max_frames_num
+        self.video_fps = float(video_fps) if video_fps is not None else None
         self.num_concurrent = max(1, int(num_concurrent))
         self.adaptive_concurrency = parse_bool(adaptive_concurrency)
         self.adaptive_config = AdaptiveConcurrencyConfig.from_raw(
@@ -200,13 +203,32 @@ def encode_image(self, image: Union[Image.Image, str]):
     def encode_video(self, video_path, for_get_frames_num):
         vr = VideoReader(video_path, ctx=cpu(0))
         total_frame_num = len(vr)
-        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int)
-
-        # Ensure the last frame is included
-        if total_frame_num - 1 not in uniform_sampled_frames:
-            uniform_sampled_frames = np.append(uniform_sampled_frames, total_frame_num - 1)
+        if total_frame_num <= 0:
+            return []
 
-        frame_idx = uniform_sampled_frames.tolist()
+        frame_idx = []
+
+        if self.video_fps is not None and self.video_fps > 0:
+            source_fps = float(vr.get_avg_fps()) if hasattr(vr, "get_avg_fps") else 0.0
+            if source_fps > 0:
+                step = max(1, int(round(source_fps / self.video_fps)))
+                frame_idx = list(range(0, total_frame_num, step))
+                if frame_idx and frame_idx[-1] != total_frame_num - 1:
+                    frame_idx.append(total_frame_num - 1)
+
+        if not frame_idx:
+            sample_count = min(max(1, int(for_get_frames_num)), total_frame_num)
+            uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_count, dtype=int)
+            if total_frame_num - 1 not in uniform_sampled_frames:
+                uniform_sampled_frames = np.append(uniform_sampled_frames, total_frame_num - 1)
+            frame_idx = uniform_sampled_frames.tolist()
+        elif for_get_frames_num and len(frame_idx) > int(for_get_frames_num):
+            keep = np.linspace(0, len(frame_idx) - 1, int(for_get_frames_num), dtype=int)
+            frame_idx = [frame_idx[i] for i in keep]
+            if frame_idx[-1] != total_frame_num - 1:
+                frame_idx.append(total_frame_num - 1)
+
+        frame_idx = sorted(set(frame_idx))
         frames = vr.get_batch(frame_idx).asnumpy()
 
         base64_frames = []
@@ -223,6 +245,13 @@ def encode_video(self, video_path, for_get_frames_num):
 
         return base64_frames
 
+    def encode_audio_file(self, audio_path: str):
+        """Read an audio file and return it as base64 text plus a format label.
+
+        Args:
+            audio_path: Path of the audio file to read.
+
+        Returns:
+            A ``(audio_b64, audio_format)`` tuple: the file's bytes encoded as
+            base64 text, and the lower-cased extension without its dot when it
+            is one of wav/mp3/flac/aac/ogg/m4a, otherwise ``"wav"``.
+        """
+        known_formats = {"wav", "mp3", "flac", "aac", "ogg", "m4a"}
+        _, suffix = os.path.splitext(audio_path)
+        audio_format = suffix.lower().lstrip(".")
+        # NOTE(review): unrecognised extensions are labelled "wav" without
+        # inspecting the content -- confirm the receiving API tolerates that.
+        if audio_format not in known_formats:
+            audio_format = "wav"
+        with open(audio_path, "rb") as stream:
+            raw_bytes = stream.read()
+        return base64.b64encode(raw_bytes).decode("utf-8"), audio_format
+
     def flatten(self, input):
         new_list = []
         for i in input:
@@ -370,9 +399,12 @@ def build_payload_for_index(global_index: int):
                 visuals = self.flatten(visuals)
                 imgs = []
                 for visual in visuals:
-                    if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual):
+                    if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual or ".webm" in visual or ".mkv" in visual):
                         frames = self.encode_video(visual, self.max_frames_num)
                         imgs.extend(frames)
+                    elif isinstance(visual, str) and (".wav" in visual or ".mp3" in visual or ".flac" in visual or ".aac" in visual or ".ogg" in visual or ".m4a" in visual):
+                        audio_b64, audio_format = self.encode_audio_file(visual)
+                        imgs.append({"audio_b64": audio_b64, "audio_format": audio_format})
                     elif isinstance(visual, str) and (".jpg" in visual or ".jpeg" in visual or ".png" in visual or ".gif" in visual or ".bmp" in visual or ".tiff" in visual or ".webp" in visual):
                         imgs.append(self.encode_image(visual))
                     elif isinstance(visual, Image.Image):
@@ -390,12 +422,20 @@ def build_payload_for_index(global_index: int):
             }
             payload["messages"][0]["content"].append({"type": "text", "text": context})
             for img in imgs:
-                payload["messages"][0]["content"].append(
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": f"data:image/png;base64,{img}"},
-                    }
-                )
+                if isinstance(img, dict) and "audio_b64" in img:
+                    payload["messages"][0]["content"].append(
+                        {
+                            "type": "input_audio",
+                            "input_audio": {"data": img["audio_b64"], "format": img["audio_format"]},
+                        }
+                    )
+                else:
+                    payload["messages"][0]["content"].append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{img}"},
+                        }
+                    )
 
             if "o1" in self.model_version or "o3" in self.model_version:
                 payload.pop("temperature")
 
@@ -0,0 +1,25 @@
+# Task config for `av_asr`: a generate_until task whose test split is read
+# from a JSON file and whose outputs are scored with the `wer` metric.
+dataset_path: json
+dataset_kwargs:
+  data_files:
+    # Relative path -- presumably resolved from the working directory; verify.
+    test: data/av_asr_test.json
+task: av_asr
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.av_asr_doc_to_visual
+doc_to_text: !function utils.av_asr_doc_to_text
+doc_to_target: !function utils.av_asr_doc_to_target
+generation_kwargs:
+  max_new_tokens: 256
+  temperature: 0
+  do_sample: false
+process_results: !function utils.av_asr_process_results
+metric_list:
+  # Aggregated by utils.av_asr_wer; a lower value is better.
+  - metric: wer
+    aggregation: !function utils.av_asr_wer
+    higher_is_better: false
+lmms_eval_specific_kwargs:
+  default:
+    # Both empty: the prompt is the question text alone.
+    pre_prompt: ""
+    post_prompt: ""
+metadata:
+  - version: 0.0
@@ -0,0 +1,81 @@
+import re
+
+
+def _normalize(text):
+    """Return *text* lower-cased with punctuation removed and spaces collapsed.
+
+    Falsy input (``None``, ``""``, ``0``) becomes the empty string. Every
+    character other than a-z, 0-9, whitespace and the apostrophe is replaced
+    by a space, so letters outside a-z (for example accented ones) are
+    dropped as well. Whitespace runs then collapse to single spaces.
+    """
+    cleaned = str(text or "").strip().lower()
+    cleaned = re.sub(r"[^a-z0-9\s']", " ", cleaned)
+    # Splitting on whitespace and re-joining collapses runs and trims the ends.
+    return " ".join(cleaned.split())
+
+
+def _word_error_rate(reference, hypothesis):
+    """Return the word-level edit distance divided by the reference length.
+
+    Both strings are passed through ``_normalize`` and split on whitespace.
+    When the reference has no words the result is 0.0 for an empty
+    hypothesis and 1.0 otherwise. Insertions, deletions and substitutions
+    each cost 1.
+    """
+    ref_tokens = _normalize(reference).split()
+    hyp_tokens = _normalize(hypothesis).split()
+    if not ref_tokens:
+        return 1.0 if hyp_tokens else 0.0
+
+    # Only the previous row of the edit-distance table is needed at any time.
+    previous = list(range(len(hyp_tokens) + 1))
+    for ref_pos, ref_token in enumerate(ref_tokens, start=1):
+        current = [ref_pos]
+        for hyp_pos, hyp_token in enumerate(hyp_tokens, start=1):
+            substitution = previous[hyp_pos - 1] + (0 if ref_token == hyp_token else 1)
+            deletion = previous[hyp_pos] + 1
+            insertion = current[hyp_pos - 1] + 1
+            current.append(min(deletion, insertion, substitution))
+        previous = current
+
+    return previous[-1] / len(ref_tokens)
+
+
+def av_asr_doc_to_visual(doc):
+    """Collect at most one audio entry and one video entry from *doc*.
+
+    The first truthy value among ``audio``/``audio_path`` is taken, then the
+    first truthy value among ``video``/``video_path``/``file``/``path``.
+    Returns the values found, audio first; the list may be empty.
+    """
+    key_groups = (
+        ("audio", "audio_path"),
+        ("video", "video_path", "file", "path"),
+    )
+    visuals = []
+    for group in key_groups:
+        # map() is lazy, so lookup stops at the first truthy value.
+        found = next((value for value in map(doc.get, group) if value), None)
+        if found:
+            visuals.append(found)
+    return visuals
+
+
+def av_asr_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the prompt: ``pre_prompt`` + question + ``post_prompt``.
+
+    The question is ``doc["question"]`` converted to a stripped string, or a
+    default transcription instruction when the key is absent. Missing prompt
+    parts default to the empty string.
+    """
+    prompt_cfg = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
+    prefix = prompt_cfg.get("pre_prompt", "")
+    suffix = prompt_cfg.get("post_prompt", "")
+    raw_question = doc.get("question", "Transcribe the speech in this video.")
+    body = str(raw_question).strip()
+    return f"{prefix}{body}{suffix}"
+
+
+def av_asr_doc_to_target(doc):
+    """Return the reference transcript of *doc* as a string.
+
+    Looks up ``text``, ``transcript``, ``gt`` and ``answer`` in that order
+    and returns the first value that is not ``None``; returns ``""`` when
+    none is present.
+    """
+    candidates = map(doc.get, ("text", "transcript", "gt", "answer"))
+    first = next((value for value in candidates if value is not None), None)
+    if first is None:
+        return ""
+    return str(first)
+
+
+def av_asr_process_results(doc, results):
+    """Pair the first model output with the reference transcript.
+
+    Returns ``{"wer": {"gt": <reference>, "pred": <prediction>}}``; the
+    prediction is ``""`` when *results* is empty.
+    """
+    hypothesis = ""
+    if results:
+        hypothesis = results[0]
+    return {"wer": {"gt": av_asr_doc_to_target(doc), "pred": hypothesis}}
+
+
+def av_asr_wer(items):
+    """Return the mean per-item word error rate, scaled to a percentage.
+
+    Each item is a mapping with ``gt`` and ``pred`` entries (missing entries
+    count as ``""``). An empty *items* yields 0.0.
+    """
+    # NOTE(review): this averages per-item rates, so every item weighs the
+    # same regardless of its reference length -- confirm that is intended.
+    if items:
+        accumulated = 0.0
+        for entry in items:
+            accumulated += _word_error_rate(entry.get("gt", ""), entry.get("pred", ""))
+        return 100.0 * accumulated / len(items)
+    return 0.0
@@ -0,0 +1,3 @@
+# Group `anet_qa` with a single member task, `activitynetqa`.
+group: anet_qa
+task:
+  - activitynetqa
@@ -0,0 +1,3 @@
+# Group `egosch_a` with a single member task, `egoschema`.
+group: egosch_a
+task:
+  - egoschema
@@ -0,0 +1,3 @@
+# Group `mmmu_a` with a single member task, `mmmu_val`.
+group: mmmu_a
+task:
+  - mmmu_val
@@ -0,0 +1,28 @@
+# Task config for `countix`: a generate_until task whose test split is read
+# from a JSON file and scored with the `mae_norm` and `obo` metrics.
+dataset_path: json
+dataset_kwargs:
+  data_files:
+    # Relative path -- presumably resolved from the working directory; verify.
+    test: data/countix_test.json
+task: countix
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.countix_doc_to_visual
+doc_to_text: !function utils.countix_doc_to_text
+doc_to_target: !function utils.countix_doc_to_target
+generation_kwargs:
+  # Short budget: the prompt asks for a single integer.
+  max_new_tokens: 16
+  temperature: 0
+  do_sample: false
+process_results: !function utils.countix_process_results
+metric_list:
+  # Both metrics are averaged over documents.
+  - metric: mae_norm
+    aggregation: mean
+    higher_is_better: false
+  - metric: obo
+    aggregation: mean
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with a single integer."
+metadata:
+  - version: 0.0
@@ -0,0 +1,53 @@
+import re
+
+
+def _extract_count(value):
+    """Parse the first number found in *value* and return it as an int.
+
+    *value* is converted to text, commas are removed, and the first run of
+    digits (optionally signed, optionally with a decimal part) is rounded to
+    the nearest integer. Returns ``None`` when *value* is ``None`` or holds
+    no digits.
+    """
+    if value is None:
+        return None
+    cleaned = str(value).strip().lower().replace(",", "")
+    found = re.search(r"-?\d+(?:\.\d+)?", cleaned)
+    return int(round(float(found.group(0)))) if found else None
+
+
+def _get_target_count(doc):
+    """Return the ground-truth count of *doc*, or ``None`` if it has none.
+
+    Tries ``count``, ``answer``, ``number``, ``gt_count`` and ``label`` in
+    that order and returns the first one that parses to an integer.
+    """
+    label_keys = ("count", "answer", "number", "gt_count", "label")
+    # The generator is lazy, so later keys are not parsed once one succeeds.
+    parsed = (_extract_count(doc.get(key)) for key in label_keys)
+    return next((count for count in parsed if count is not None), None)
+
+
+def countix_doc_to_visual(doc):
+    """Return a one-element list holding the clip reference of *doc*.
+
+    The first truthy value among ``video``, ``video_path``, ``image``,
+    ``img``, ``file`` and ``path`` is used; an empty list is returned when
+    none of them is set.
+    """
+    media_keys = ("video", "video_path", "image", "img", "file", "path")
+    found = next((value for value in map(doc.get, media_keys) if value), None)
+    return [found] if found else []
+
+
+def countix_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the prompt: ``pre_prompt`` + question + ``post_prompt``.
+
+    The question is ``doc["question"]`` converted to a stripped string, or a
+    default counting instruction when the key is absent. Missing prompt
+    parts default to the empty string.
+    """
+    prompt_cfg = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
+    prefix = prompt_cfg.get("pre_prompt", "")
+    suffix = prompt_cfg.get("post_prompt", "")
+    raw_question = doc.get("question", "Count the number of repetitions in this clip.")
+    body = str(raw_question).strip()
+    return f"{prefix}{body}{suffix}"
+
+
+def countix_doc_to_target(doc):
+    """Return the ground-truth count as a string, or ``""`` when *doc* has none."""
+    parsed_target = _get_target_count(doc)
+    if parsed_target is None:
+        return ""
+    return str(parsed_target)
+
+
+def countix_process_results(doc, results):
+    """Score one predicted repetition count against the ground truth.
+
+    Args:
+        doc: Dataset record; its target is read with ``_get_target_count``.
+        results: Model outputs; only the first one is parsed, and an empty
+            list is treated as an empty answer.
+
+    Returns:
+        A dict with two floats:
+        ``mae_norm`` -- absolute count error divided by ``|target| + 0.1``;
+        ``obo`` -- 1.0 when the prediction is within one of the target,
+        otherwise 0.0.
+        When the record has no parseable target both values are 0.0.
+    """
+    prediction = results[0] if results else ""
+    pred_count = _extract_count(prediction)
+    target_count = _get_target_count(doc)
+
+    if target_count is None:
+        # No ground truth to compare against; the 0.0 scores are unchanged.
+        return {"mae_norm": 0.0, "obo": 0.0}
+
+    # abs() keeps the normaliser positive even if a label parses as negative;
+    # the 0.1 offset avoids dividing by zero when the target is 0.
+    denominator = abs(target_count) + 0.1
+
+    if pred_count is None:
+        # An unparseable answer is scored like a prediction of 0, so its error
+        # is normalised the same way as parsed answers instead of contributing
+        # the raw target value to the mean.
+        return {"mae_norm": float(abs(target_count) / denominator), "obo": 0.0}
+
+    error = abs(pred_count - target_count)
+    return {"mae_norm": float(error / denominator), "obo": float(error <= 1)}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+group: anet_qa`
	`2`	`+task:`
	`3`	`+ - activitynetqa`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+group: egosch_a`
	`2`	`+task:`
	`3`	`+ - egoschema`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+group: mmmu_a`
	`2`	`+task:`
	`3`	`+ - mmmu_val`