Skip to content

Commit 5166317

Browse files
committed
Merge remote-tracking branch 'origin/main' into dev/v0d4
2 parents aa9b215 + e24a7d8 commit 5166317

4 files changed

Lines changed: 30 additions & 11 deletions

File tree

lmms_eval/api/task.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -944,14 +944,7 @@ def _download_from_youtube(path):
944944
force_unzip = dataset_kwargs.get("force_unzip", False)
945945
revision = dataset_kwargs.get("revision", "main")
946946
create_link = dataset_kwargs.get("create_link", False)
947-
cache_path = snapshot_download(
948-
repo_id=self.DATASET_PATH,
949-
cache_dir=cache_dir,
950-
revision=revision,
951-
repo_type="dataset",
952-
force_download=force_download,
953-
etag_timeout=60,
954-
)
947+
cache_path = snapshot_download(repo_id=self.DATASET_PATH, revision=revision, repo_type="dataset", force_download=force_download, etag_timeout=60)
955948
zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
956949
tar_files = glob(os.path.join(cache_path, "**/*.tar*"), recursive=True)
957950

lmms_eval/models/simple/openai_compatible.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ def __init__(
3838
continual_mode: bool = False,
3939
response_persistent_folder: str = None,
4040
azure_openai: bool = False,
41+
max_frames_num: int = 10,
4142
**kwargs,
4243
) -> None:
4344
super().__init__()
@@ -46,6 +47,7 @@ def __init__(
4647
self.max_retries = max_retries
4748
self.max_size_in_mb = max_size_in_mb # some models have a limit on the size of the image
4849
self.continual_mode = continual_mode
50+
self.max_frames_num = max_frames_num
4951
if self.continual_mode:
5052
if response_persistent_folder is None:
5153
raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")

lmms_eval/models/simple/vllm.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ def __init__(
4040
threads: int = 16, # Threads to use for decoding visuals
4141
trust_remote_code: Optional[bool] = True,
4242
chat_template: Optional[str] = None,
43+
min_image_pixels: int = 28, # minimum image dimension, required for Qwen 2/2.5-VL models
4344
**kwargs,
4445
) -> None:
4546
super().__init__()
@@ -50,6 +51,9 @@ def __init__(
5051
self.max_frame_num = max_frame_num
5152
self.threads = threads
5253
self.chat_template = chat_template
54+
self.min_image_pixels = min_image_pixels
55+
# Qwen 2/2.5-VL models enforce minimum image dimensions
56+
self._enforce_image_resize = self._is_qwen_vl_model(model_version)
5357

5458
# Convert any string arguments that start with { and end with } to dictionaries
5559
for key, value in kwargs.items():
@@ -85,13 +89,32 @@ def __init__(
8589
self.device = self.accelerator.device
8690
self.batch_size_per_gpu = int(batch_size)
8791

92+
def _is_qwen_vl_model(self, model_version: str) -> bool:
93+
qwen_vl_patterns = ["qwen2-vl", "qwen2.5-vl"]
94+
return any(pattern in model_version.lower() for pattern in qwen_vl_patterns)
95+
96+
def _maybe_resize_image(self, img: Image.Image) -> Image.Image:
97+
# edge‐case validation
98+
if self.min_image_pixels <= 0:
99+
return img
100+
if min(img.size) <= 0:
101+
raise ValueError(f"Invalid image dimensions: {img.size}")
102+
103+
if not self._enforce_image_resize or min(img.size) >= self.min_image_pixels:
104+
return img
105+
106+
scale = self.min_image_pixels / min(img.size) # maintain original aspect ratio
107+
new_size = tuple(int(dim * scale) for dim in img.size)
108+
return img.resize(new_size, Image.BICUBIC)
109+
88110
# Function to encode the image
89111
def encode_image(self, image: Union[Image.Image, str]):
90112
if isinstance(image, str):
91113
img = Image.open(image).convert("RGB")
92114
else:
93115
img = image.copy()
94116

117+
img = self._maybe_resize_image(img)
95118
output_buffer = BytesIO()
96119
img.save(output_buffer, format="PNG")
97120
byte_data = output_buffer.getvalue()
@@ -115,6 +138,7 @@ def encode_video(self, video_path):
115138
base64_frames = []
116139
for frame in frames:
117140
img = Image.fromarray(frame)
141+
img = self._maybe_resize_image(img)
118142
output_buffer = BytesIO()
119143
img.save(output_buffer, format="PNG")
120144
byte_data = output_buffer.getvalue()

lmms_eval/tasks/scienceqa/utils.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -34,11 +34,11 @@ def sqa_doc_to_target(doc):
3434
def sqa_process_results(doc, results):
3535
# I know this is weird, but it's how llava parse it.
3636
target = sqa_doc_to_target(doc).strip().lower()
37-
pred = results[0].strip().lower()
38-
if pred == target:
37+
pred = results[0].strip()
38+
if pred.lower() == target:
3939
return {"exact_match": 1.0}
4040
# pattern: ^[A-Z]\. .*
4141
if len(pred) >= 2 and pred[0].isupper() and pred[1] == ".":
42-
result = 1.0 if pred[0] == target else 0.0
42+
result = 1.0 if pred[0].lower() == target else 0.0
4343
return {"exact_match": result}
4444
return {"exact_match": 0.0}

0 commit comments

Comments
 (0)