Merge remote-tracking branch 'origin/main' into dev/v0d4

kcz358 · kcz358 · commit 516631777f27 · 2025-07-07T18:25:12.000-07:00
diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
@@ -944,14 +944,7 @@ def _download_from_youtube(path):
                     force_unzip = dataset_kwargs.get("force_unzip", False)
                     revision = dataset_kwargs.get("revision", "main")
                     create_link = dataset_kwargs.get("create_link", False)
-                    cache_path = snapshot_download(
-                        repo_id=self.DATASET_PATH,
-                        cache_dir=cache_dir,
-                        revision=revision,
-                        repo_type="dataset",
-                        force_download=force_download,
-                        etag_timeout=60,
-                    )
+                    cache_path = snapshot_download(repo_id=self.DATASET_PATH, revision=revision, repo_type="dataset", force_download=force_download, etag_timeout=60)
                     zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
                     tar_files = glob(os.path.join(cache_path, "**/*.tar*"), recursive=True)
 
diff --git a/lmms_eval/models/simple/openai_compatible.py b/lmms_eval/models/simple/openai_compatible.py
@@ -38,6 +38,7 @@ def __init__(
         continual_mode: bool = False,
         response_persistent_folder: str = None,
         azure_openai: bool = False,
+        max_frames_num: int = 10,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -46,6 +47,7 @@ def __init__(
         self.max_retries = max_retries
         self.max_size_in_mb = max_size_in_mb  # some models have a limit on the size of the image
         self.continual_mode = continual_mode
+        self.max_frames_num = max_frames_num
         if self.continual_mode:
             if response_persistent_folder is None:
                 raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
diff --git a/lmms_eval/models/simple/vllm.py b/lmms_eval/models/simple/vllm.py
@@ -40,6 +40,7 @@ def __init__(
         threads: int = 16,  # Threads to use for decoding visuals
         trust_remote_code: Optional[bool] = True,
         chat_template: Optional[str] = None,
+        min_image_pixels: int = 28,  # minimum image dimension, required for Qwen 2/2.5-VL models
         **kwargs,
     ) -> None:
         super().__init__()
@@ -50,6 +51,9 @@ def __init__(
         self.max_frame_num = max_frame_num
         self.threads = threads
         self.chat_template = chat_template
+        self.min_image_pixels = min_image_pixels
+        # Qwen 2/2.5-VL models enforce minimum image dimensions
+        self._enforce_image_resize = self._is_qwen_vl_model(model_version)
 
         # Convert any string arguments that start with { and end with } to dictionaries
         for key, value in kwargs.items():
@@ -85,13 +89,32 @@ def __init__(
         self.device = self.accelerator.device
         self.batch_size_per_gpu = int(batch_size)
 
+    def _is_qwen_vl_model(self, model_version: str) -> bool:
+        qwen_vl_patterns = ["qwen2-vl", "qwen2.5-vl"]  # lowercase substrings that mark a Qwen 2/2.5-VL model name
+        return any(pattern in model_version.lower() for pattern in qwen_vl_patterns)  # case-insensitive substring match
+
+    def _maybe_resize_image(self, img: Image.Image) -> Image.Image:
+        # Upscale img so its shorter side reaches min_image_pixels when resizing is enforced; otherwise return it unchanged.
+        if self.min_image_pixels <= 0:
+            return img
+        if min(img.size) <= 0:
+            raise ValueError(f"Invalid image dimensions: {img.size}")
+
+        if not self._enforce_image_resize or min(img.size) >= self.min_image_pixels:
+            return img
+
+        scale = self.min_image_pixels / min(img.size)  # maintain original aspect ratio
+        new_size = tuple(max(self.min_image_pixels, int(dim * scale)) for dim in img.size)  # clamp: int() may truncate the short side below the minimum after float rounding
+        return img.resize(new_size, Image.BICUBIC)
+
     # Function to encode the image
     def encode_image(self, image: Union[Image.Image, str]):
         if isinstance(image, str):
             img = Image.open(image).convert("RGB")
         else:
             img = image.copy()
 
+        img = self._maybe_resize_image(img)
         output_buffer = BytesIO()
         img.save(output_buffer, format="PNG")
         byte_data = output_buffer.getvalue()
@@ -115,6 +138,7 @@ def encode_video(self, video_path):
         base64_frames = []
         for frame in frames:
             img = Image.fromarray(frame)
+            img = self._maybe_resize_image(img)
             output_buffer = BytesIO()
             img.save(output_buffer, format="PNG")
             byte_data = output_buffer.getvalue()
diff --git a/lmms_eval/tasks/scienceqa/utils.py b/lmms_eval/tasks/scienceqa/utils.py
@@ -34,11 +34,11 @@ def sqa_doc_to_target(doc):
 def sqa_process_results(doc, results):
     # I know this is weird, but it's how llava parse it.
     target = sqa_doc_to_target(doc).strip().lower()
-    pred = results[0].strip().lower()
-    if pred == target:
+    pred = results[0].strip()  # keep the original case: the "^[A-Z]\." check below relies on pred[0].isupper()
+    if pred.lower() == target:
         return {"exact_match": 1.0}
     # pattern: ^[A-Z]\. .*
     if len(pred) >= 2 and pred[0].isupper() and pred[1] == ".":
-        result = 1.0 if pred[0] == target else 0.0
+        result = 1.0 if pred[0].lower() == target else 0.0
         return {"exact_match": result}
     return {"exact_match": 0.0}