Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,4 @@ span.log
uv.lock
workspace/*
.claude/*
remote_code/*
1 change: 1 addition & 0 deletions lmms_eval/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
"sglang": "Sglang",
"huggingface": "Huggingface",
"async_openai": "AsyncOpenAIChat",
"longvila": "LongVila",
}


Expand Down
184 changes: 184 additions & 0 deletions lmms_eval/models/chat/longvila.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional, Tuple, Union

from tqdm import tqdm
from transformers import AutoModel

from lmms_eval.api.instance import Instance
from lmms_eval.api.registry import register_model
from lmms_eval.models.model_utils.gen_metrics import log_metrics
from lmms_eval.models.simple.vllm import VLLM as VLLMSimple
from lmms_eval.protocol import ChatMessages

try:
from vllm import LLM, SamplingParams
except ImportError:
vllm = None

Comment on lines +16 to +20
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

vLLM import fallback can crash later; fail fast with actionable error

If vllm isn’t installed, SamplingParams is undefined but later used. Guard and raise with guidance.

-try:
-    from vllm import LLM, SamplingParams
-except ImportError:
-    vllm = None
+try:
+    from vllm import LLM, SamplingParams
+except ImportError as _vllm_err:
+    LLM = None  # type: ignore[assignment]
+    SamplingParams = None  # type: ignore[assignment]
+    _VLLM_IMPORT_ERROR = _vllm_err

And in __init__:

     def __init__(
@@
     ):
+        if LLM is None or SamplingParams is None:
+            raise ImportError(
+                "vLLM is required for LongVila. Install with `pip install vllm`."
+            ) from _VLLM_IMPORT_ERROR
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
try:
from vllm import LLM, SamplingParams
except ImportError:
vllm = None
try:
from vllm import LLM, SamplingParams
except ImportError as _vllm_err:
LLM = None # type: ignore[assignment]
SamplingParams = None # type: ignore[assignment]
_VLLM_IMPORT_ERROR = _vllm_err
🤖 Prompt for AI Agents
In lmms_eval/models/chat/longvila.py around lines 16-20, the vllm import
fallback leaves SamplingParams undefined which will cause a crash later; modify
the import block to fail fast by raising an ImportError (or RuntimeError) with
an actionable message instructing to install vllm (and optionally provide pip
install instructions and mention SamplingParams/LLM are required), and update
the class __init__ to check that vllm and SamplingParams are available before
using them, raising the same clear error if they are missing so callers get
immediate, actionable feedback instead of a later NameError.

WORKERS = int(os.getenv("WORKERS", "32"))


@register_model("longvila")
class LongVila(VLLMSimple):
is_simple = False

def __init__(
self,
model="Efficient-Large-Model/LongVILA-R1-7B",
tensor_parallel_size=1,
data_parallel_size=1,
gpu_memory_utilization=0.5,
batch_size=1,
max_frame_num=32,
trust_remote_code=True,
chat_template=None,
max_pixels: int = 1605632,
min_image_pixels=28,
fps: Optional[int] = None,
device_map: Optional[str] = "cuda",
**kwargs,
):
# vLLM requires the path to the autoregressive llm weights under the model root
model_root = model
llm_path = os.path.join(model_root, "llm")
# Enable prompt embeddings so we can pass encoder-produced embeddings directly
Comment on lines +28 to +47
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Validate model paths and pass-through trust flag

Check that llm_path exists to fail early with a clear message; also forward trust_remote_code consistently.

         model_root = model
         llm_path = os.path.join(model_root, "llm")
+        if not os.path.isdir(llm_path):
+            raise FileNotFoundError(
+                f"Expected autoregressive LLM under '{llm_path}'. "
+                "Verify your LongVILA model layout."
+            )
@@
-        kwargs["enable_prompt_embeds"] = True
+        kwargs["enable_prompt_embeds"] = True
@@
-        super().__init__(llm_path, tensor_parallel_size, data_parallel_size, gpu_memory_utilization, batch_size, max_frame_num, trust_remote_code, chat_template, min_image_pixels, **kwargs)
+        super().__init__(
+            llm_path,
+            tensor_parallel_size,
+            data_parallel_size,
+            gpu_memory_utilization,
+            batch_size,
+            max_frame_num,
+            trust_remote_code,
+            chat_template,
+            min_image_pixels,
+            **kwargs,
+        )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def __init__(
self,
model="Efficient-Large-Model/LongVILA-R1-7B",
tensor_parallel_size=1,
data_parallel_size=1,
gpu_memory_utilization=0.5,
batch_size=1,
max_frame_num=32,
trust_remote_code=True,
chat_template=None,
max_pixels: int = 1605632,
min_image_pixels=28,
fps: Optional[int] = None,
device_map: Optional[str] = "cuda",
**kwargs,
):
# vLLM requires the path to the autoregressive llm weights under the model root
model_root = model
llm_path = os.path.join(model_root, "llm")
# Enable prompt embeddings so we can pass encoder-produced embeddings directly
def __init__(
self,
model="Efficient-Large-Model/LongVILA-R1-7B",
tensor_parallel_size=1,
data_parallel_size=1,
gpu_memory_utilization=0.5,
batch_size=1,
max_frame_num=32,
trust_remote_code=True,
chat_template=None,
max_pixels: int = 1605632,
min_image_pixels=28,
fps: Optional[int] = None,
device_map: Optional[str] = "cuda",
**kwargs,
):
# vLLM requires the path to the autoregressive llm weights under the model root
model_root = model
llm_path = os.path.join(model_root, "llm")
if not os.path.isdir(llm_path):
raise FileNotFoundError(
f"Expected autoregressive LLM under '{llm_path}'. "
"Verify your LongVILA model layout."
)
# Enable prompt embeddings so we can pass encoder-produced embeddings directly
kwargs["enable_prompt_embeds"] = True
super().__init__(
llm_path,
tensor_parallel_size,
data_parallel_size,
gpu_memory_utilization,
batch_size,
max_frame_num,
trust_remote_code,
chat_template,
min_image_pixels,
**kwargs,
)
🤖 Prompt for AI Agents
In lmms_eval/models/chat/longvila.py around lines 28 to 47, add an explicit
existence check for the computed llm_path and raise a clear FileNotFoundError if
it doesn't exist so the class fails fast with a helpful message; also ensure the
incoming trust_remote_code flag is stored on self (e.g., self.trust_remote_code
= trust_remote_code) and passed through to any downstream model-loading calls
that accept trust_remote_code so the same behavior is consistently applied.

kwargs["enable_prompt_embeds"] = True
self.fps = fps
self.max_pixels = max_pixels

# Set up imports from the model's remote_code directory
# The LongVILA repo provides preprocessing utilities we must call directly
try:
from remote_code.media import extract_media as _extract_media
from remote_code.mm_utils import process_images as _process_images
from remote_code.tokenizer_utils import (
tokenize_conversation as _tokenize_conversation,
)
except Exception as e:
raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}")

self.extract_media = _extract_media
self.process_images = _process_images
self.tokenize_conversation = _tokenize_conversation

Comment on lines +52 to +66
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

remote_code import path will fail unless it’s on sys.path

The code imports remote_code.* without adding model_root to sys.path. Load explicitly from the model folder to avoid relying on a repo-level remote_code/.

-        try:
-            from remote_code.media import extract_media as _extract_media
-            from remote_code.mm_utils import process_images as _process_images
-            from remote_code.tokenizer_utils import (
-                tokenize_conversation as _tokenize_conversation,
-            )
-        except Exception as e:
-            raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}")
+        try:
+            remote_code_dir = os.path.join(model_root)
+            if remote_code_dir not in sys.path:
+                sys.path.insert(0, remote_code_dir)
+            from remote_code.media import extract_media as _extract_media  # type: ignore
+            from remote_code.mm_utils import process_images as _process_images  # type: ignore
+            from remote_code.tokenizer_utils import (  # type: ignore
+                tokenize_conversation as _tokenize_conversation,
+            )
+        except Exception as err:
+            raise ImportError(
+                f"Failed to import LongVILA remote_code from '{model_root}'. "
+                "Ensure the model path contains a 'remote_code/' package."
+            ) from err
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Set up imports from the model's remote_code directory
# The LongVILA repo provides preprocessing utilities we must call directly
try:
from remote_code.media import extract_media as _extract_media
from remote_code.mm_utils import process_images as _process_images
from remote_code.tokenizer_utils import (
tokenize_conversation as _tokenize_conversation,
)
except Exception as e:
raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}")
self.extract_media = _extract_media
self.process_images = _process_images
self.tokenize_conversation = _tokenize_conversation
# Set up imports from the model's remote_code directory
# The LongVILA repo provides preprocessing utilities we must call directly
try:
remote_code_dir = os.path.join(model_root)
if remote_code_dir not in sys.path:
sys.path.insert(0, remote_code_dir)
from remote_code.media import extract_media as _extract_media # type: ignore
from remote_code.mm_utils import process_images as _process_images # type: ignore
from remote_code.tokenizer_utils import ( # type: ignore
tokenize_conversation as _tokenize_conversation,
)
except Exception as err:
raise ImportError(
f"Failed to import LongVILA remote_code from '{model_root}'. "
"Ensure the model path contains a 'remote_code/' package."
) from err
self.extract_media = _extract_media
self.process_images = _process_images
self.tokenize_conversation = _tokenize_conversation
🧰 Tools
🪛 Ruff (0.12.2)

60-60: Do not catch blind exception: Exception

(BLE001)


61-61: Within an except clause, raise exceptions with raise ... from err or raise ... from None to distinguish them from errors in exception handling

(B904)


61-61: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
In lmms_eval/models/chat/longvila.py around lines 52 to 66, the imports from
remote_code will fail unless model_root is on sys.path; prepend the absolute
model_root to sys.path before the try block (and optionally pop it afterward) so
Python can import remote_code from the model folder, ensure model_root exists
and is resolved with os.path.abspath, and keep the existing try/except and
attribute assignments intact.

# Load the encoder that produces prompt embeddings for the LLM
# llm_only_need_embed reduces memory usage to only what's needed for embedding
self.model_encoder = AutoModel.from_pretrained(
model_root,
trust_remote_code=True,
device_map=device_map,
llm_only_need_embed=True,
)
super().__init__(llm_path, tensor_parallel_size, data_parallel_size, gpu_memory_utilization, batch_size, max_frame_num, trust_remote_code, chat_template, min_image_pixels, **kwargs)

def _to_remote_conversation(self, chat_messages: ChatMessages) -> list:
"""
Convert ChatMessages to LongVILA remote_code conversation format.
[{"from": "human"|"gpt", "value": [str | {"path": media_path}, ...]}, ...]
"""
role_map = {"user": "human", "assistant": "gpt", "system": "human"}
conversation = []
for msg in chat_messages.messages:
from_role = role_map.get(msg.role, "human")
value_parts = []
for content in msg.content:
# ChatTextContent
if getattr(content, "type", None) == "text":
value_parts.append(content.text)
# Images, Videos, Audios -> use path dicts as required by tokenizer_utils
elif getattr(content, "type", None) in ("image", "video", "audio"):
value_parts.append({"path": content.url})
if value_parts:
conversation.append({"from": from_role, "value": value_parts})
return conversation

def make_one_request(self, request: Instance) -> Tuple["object", dict]:
"""
Build prompt embeddings and per-request sampling params from an Instance.
Returns (inputs_embeds, params_dict). Does not mutate input.
"""
ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
chat_messages = ChatMessages(messages=raw_messages)

# Copy to avoid side-effects across threads
_gen = dict(gen_kwargs or {})
_gen.setdefault("max_new_tokens", 4096)
_gen.setdefault("temperature", 0)
_gen.setdefault("top_p", 0.95)

params = {
"temperature": _gen["temperature"],
"max_tokens": _gen["max_new_tokens"],
"top_p": _gen["top_p"],
}

# Convert to LongVILA remote_code conversation format
conversation = self._to_remote_conversation(chat_messages)

# Extract and preprocess media
if self.fps:
self.model_encoder.config.fps = self.fps
else:
self.model_encoder.config.num_video_frames = self.max_frame_num
self.model_encoder.config.fps = 0
media = self.extract_media(conversation, self.model_encoder.config)
if "video" in media and media["video"] is not None:
media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]

# Tokenize conversation and move to CUDA for embedding
input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()

# Create prompt embeddings using the model encoder
inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)

return inputs_embeds, params

Comment on lines +98 to +139
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Device placement: avoid unconditional .cuda()

Calling .cuda() breaks on CPU-only setups or non-CUDA devices. Derive device from the encoder.

-        input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
+        input_ids = (
+            self.tokenize_conversation(
+                conversation,
+                self.model_encoder.tokenizer,
+                add_generation_prompt=True,
+            )
+            .unsqueeze(0)
+            .to(next(self.model_encoder.parameters()).device)
+        )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def make_one_request(self, request: Instance) -> Tuple["object", dict]:
"""
Build prompt embeddings and per-request sampling params from an Instance.
Returns (inputs_embeds, params_dict). Does not mutate input.
"""
ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
chat_messages = ChatMessages(messages=raw_messages)
# Copy to avoid side-effects across threads
_gen = dict(gen_kwargs or {})
_gen.setdefault("max_new_tokens", 4096)
_gen.setdefault("temperature", 0)
_gen.setdefault("top_p", 0.95)
params = {
"temperature": _gen["temperature"],
"max_tokens": _gen["max_new_tokens"],
"top_p": _gen["top_p"],
}
# Convert to LongVILA remote_code conversation format
conversation = self._to_remote_conversation(chat_messages)
# Extract and preprocess media
if self.fps:
self.model_encoder.config.fps = self.fps
else:
self.model_encoder.config.num_video_frames = self.max_frame_num
self.model_encoder.config.fps = 0
media = self.extract_media(conversation, self.model_encoder.config)
if "video" in media and media["video"] is not None:
media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]
# Tokenize conversation and move to CUDA for embedding
input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
# Create prompt embeddings using the model encoder
inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)
return inputs_embeds, params
def make_one_request(self, request: Instance) -> Tuple["object", dict]:
"""
Build prompt embeddings and per-request sampling params from an Instance.
Returns (inputs_embeds, params_dict). Does not mutate input.
"""
ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
chat_messages = ChatMessages(messages=raw_messages)
# Copy to avoid side-effects across threads
_gen = dict(gen_kwargs or {})
_gen.setdefault("max_new_tokens", 4096)
_gen.setdefault("temperature", 0)
_gen.setdefault("top_p", 0.95)
params = {
"temperature": _gen["temperature"],
"max_tokens": _gen["max_new_tokens"],
"top_p": _gen["top_p"],
}
# Convert to LongVILA remote_code conversation format
conversation = self._to_remote_conversation(chat_messages)
# Extract and preprocess media
if self.fps:
self.model_encoder.config.fps = self.fps
else:
self.model_encoder.config.num_video_frames = self.max_frame_num
self.model_encoder.config.fps = 0
media = self.extract_media(conversation, self.model_encoder.config)
if "video" in media and media["video"] is not None:
media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]
# Tokenize conversation and move to CUDA for embedding
input_ids = (
self.tokenize_conversation(
conversation,
self.model_encoder.tokenizer,
add_generation_prompt=True,
)
.unsqueeze(0)
.to(next(self.model_encoder.parameters()).device)
)
# Create prompt embeddings using the model encoder
inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)
return inputs_embeds, params

def generate_until(self, requests) -> List[str]:
res = []
self.load_cache()
res, requests = self.get_response_from_cache(requests)
pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

batch_size = self.batch_size_per_gpu
batched_requests = [requests[i : i + batch_size] for i in range(0, len(requests), batch_size)]
e2e_latency = 0
for batch_requests in batched_requests:
prompt_embeds_list = []
params_list = []
# Build embeddings sequentially to avoid GPU contention in the encoder
for req in tqdm(batch_requests, disable=(self.rank != 0), desc="Building embeddings"):
inputs_embeds, params = self.make_one_request(req)
prompt_embeds_list.append({"prompt_embeds": inputs_embeds.squeeze(0)})
params_list.append(params)

# For now, assume homogeneous sampling params within a batch
sampling_params = SamplingParams(**params_list[-1])

start_time = time.time()
response = self.client.generate(prompts=prompt_embeds_list, sampling_params=sampling_params)
end_time = time.time()

response_text = [o.outputs[0].text for o in response]
for req, text in zip(batch_requests, response_text):
self.add_request_response_to_cache(req, text)

# Calculate timing metrics for batch
e2e_latency += end_time - start_time

assert len(response_text) == len(batch_requests)
res.extend(response_text)
pbar.update(len(batch_requests))

pbar.close()
return res

def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
# TODO
assert False, "GPT4V not support"

def generate_until_multi_round(self, requests) -> List[str]:
raise NotImplementedError("TODO: Implement multi-round generation")
Comment on lines +179 to +184
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Replace assert False and remove unused arg warning

Use a proper exception and underscore the unused parameter.

-    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
-        # TODO
-        assert False, "GPT4V not support"
+    def loglikelihood(self, _requests: List[Instance]) -> List[Tuple[float, bool]]:  # noqa: ARG002
+        raise NotImplementedError("LongVila does not support loglikelihood.")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
# TODO
assert False, "GPT4V not support"
def generate_until_multi_round(self, requests) -> List[str]:
raise NotImplementedError("TODO: Implement multi-round generation")
def loglikelihood(self, _requests: List[Instance]) -> List[Tuple[float, bool]]: # noqa: ARG002
raise NotImplementedError("LongVila does not support loglikelihood.")
def generate_until_multi_round(self, requests) -> List[str]:
raise NotImplementedError("TODO: Implement multi-round generation")
🧰 Tools
🪛 Ruff (0.12.2)

179-179: Unused method argument: requests

(ARG002)


181-181: Do not assert False (python -O removes these calls), raise AssertionError()

Replace assert False

(B011)

🤖 Prompt for AI Agents
In lmms_eval/models/chat/longvila.py around lines 179 to 184, replace the assert
False with a proper exception and silence the unused-parameter warning by
renaming the argument to _requests: implement loglikelihood to raise
NotImplementedError("GPT4V not supported") instead of asserting, and change
generate_until_multi_round(self, requests) to generate_until_multi_round(self,
_requests) (or similarly prefix the parameter with an underscore) and keep its
NotImplementedError; this removes the assert and the unused-arg warning while
clearly signaling unimplemented functionality.

28 changes: 28 additions & 0 deletions lmms_eval/tasks/lvbench/lvbench.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
dataset_path: lmms-lab/LVBench
dataset_kwargs:
token: True
cache_dir: lvbench
video: True
# From_YouTube: True
test_split: train
task: lvbench
output_type: generate_until
doc_to_visual: !function utils.lvbench_doc_to_visual
doc_to_text: !function utils.lvbench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
max_new_tokens: 16
# The return value of process_results will be used by metrics
process_results: !function utils.lvbench_process_results
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: lvbench_score
aggregation: mean
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer the question with the option letter"
metadata:
- version: 0.0

74 changes: 74 additions & 0 deletions lmms_eval/tasks/lvbench/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os
import re
from pathlib import Path

import yaml

hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
base_cache_dir = os.path.expanduser(hf_home)
with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
# remove function definition since yaml load cannot handle it
if "!function" not in line:
safe_data.append(line)
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
Comment on lines +7 to +16
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Harden YAML parsing and fix unused loop var

  • Rename unused i to _i (Ruff B007).
  • Add defensive checks around YAML keys to avoid KeyError if the schema changes.

Apply:

-raw_data = f.readlines()
-safe_data = []
-for i, line in enumerate(raw_data):
+raw_data = f.readlines()
+safe_data = []
+for _i, line in enumerate(raw_data):
     # remove function definition since yaml load cannot handle it
     if "!function" not in line:
         safe_data.append(line)
-cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+data = yaml.safe_load("".join(safe_data)) or {}
+try:
+    cache_name = data["dataset_kwargs"]["cache_dir"]
+except KeyError as exc:
+    raise KeyError("Expected 'dataset_kwargs.cache_dir' in lvbench.yaml") from exc
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
base_cache_dir = os.path.expanduser(hf_home)
with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
# remove function definition since yaml load cannot handle it
if "!function" not in line:
safe_data.append(line)
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
base_cache_dir = os.path.expanduser(hf_home)
with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for _i, line in enumerate(raw_data):
# remove function definition since yaml load cannot handle it
if "!function" not in line:
safe_data.append(line)
data = yaml.safe_load("".join(safe_data)) or {}
try:
cache_name = data["dataset_kwargs"]["cache_dir"]
except KeyError as exc:
raise KeyError("Expected 'dataset_kwargs.cache_dir' in lvbench.yaml") from exc
🧰 Tools
🪛 Ruff (0.12.2)

12-12: Loop control variable i not used within loop body

Rename unused i to _i

(B007)

🤖 Prompt for AI Agents
In lmms_eval/tasks/lvbench/utils.py around lines 7 to 16, the loop uses an
unused variable `i` and the YAML access assumes keys exist which can raise
KeyError; rename `i` to `_i` to satisfy Ruff B007 and add defensive checks when
loading the YAML (e.g., check that the top-level mapping exists and contains
"dataset_kwargs" and "cache_dir" keys, raising a clear error or falling back to
a default) before assigning cache_name so the code won't crash if the schema
changes.



def lvbench_doc_to_visual(doc):
cache_dir = os.path.join(base_cache_dir, cache_name)
video_path = doc["video_path"]
assert os.path.exists(os.path.join(cache_dir, video_path))
video_path = os.path.join(cache_dir, video_path)
return [video_path]
Comment on lines +19 to +24
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Avoid asserts for I/O; add types and clearer errors

Use Path checks and raise FileNotFoundError. Add type hints.

-def lvbench_doc_to_visual(doc):
-    cache_dir = os.path.join(base_cache_dir, cache_name)
-    video_path = doc["video_path"]
-    assert os.path.exists(os.path.join(cache_dir, video_path))
-    video_path = os.path.join(cache_dir, video_path)
-    return [video_path]
+from typing import Dict, List
+
+def lvbench_doc_to_visual(doc: Dict) -> List[str]:
+    cache_dir = Path(base_cache_dir) / cache_name
+    video_path = Path(doc["video_path"])
+    abs_path = video_path if video_path.is_absolute() else (cache_dir / video_path)
+    if not abs_path.exists():
+        raise FileNotFoundError(f"Video not found: {abs_path}")
+    return [str(abs_path)]

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In lmms_eval/tasks/lvbench/utils.py around lines 19 to 24, the function uses an
assert for an I/O check and lacks type hints; replace the assert with
pathlib.Path checks and raise FileNotFoundError with a clear message if the file
is missing, add type hints (e.g., doc: Mapping[str, Any] -> List[Path]) on the
function signature, construct video_path as a Path using
base_cache_dir/cache_name and doc["video_path"], and return a list of Path
objects; ensure pathlib.Path is imported and adjust any callers if they expect
strings.



def lvbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
if lmms_eval_specific_kwargs is None:
lmms_eval_specific_kwargs = {}
if "pre_prompt" not in lmms_eval_specific_kwargs:
lmms_eval_specific_kwargs["pre_prompt"] = ""
if "post_prompt" not in lmms_eval_specific_kwargs:
lmms_eval_specific_kwargs["post_prompt"] = "\nAnswer the question with the option letter"
return lmms_eval_specific_kwargs["pre_prompt"] + doc["question"] + lmms_eval_specific_kwargs["post_prompt"]


def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [
"The best answer is",
"The correct answer is",
"The answer is",
"The answer",
"The best option is" "The correct option is",
"Best answer:" "Best option:",
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, "")

if len(s.split()) > 10 and not re.search("[ABCD]", s):
return ""

matches = re.search(r"[ABCD]", s)
if matches is None:
return ""
return matches[0]
Comment on lines +37 to +56
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Fix concatenated strings bug and improve extraction robustness

Missing commas are concatenating literals (e.g., "Best answer:" "Best option:"). Also prefer case-insensitive matching and strip punctuation.

-def extract_characters_regex(s):
+from typing import Optional
+def extract_characters_regex(s: str) -> str:
     s = s.strip()
     answer_prefixes = [
         "The best answer is",
         "The correct answer is",
         "The answer is",
         "The answer",
-        "The best option is" "The correct option is",
-        "Best answer:" "Best option:",
+        "The best option is",
+        "The correct option is",
+        "Best answer:",
+        "Best option:",
     ]
     for answer_prefix in answer_prefixes:
-        s = s.replace(answer_prefix, "")
+        s = re.sub(re.escape(answer_prefix), "", s, flags=re.IGNORECASE).strip()
 
-    if len(s.split()) > 10 and not re.search("[ABCD]", s):
+    if len(s.split()) > 10 and not re.search(r"\b[ABCD]\b", s, flags=re.I):
         return ""
 
-    matches = re.search(r"[ABCD]", s)
+    matches = re.search(r"\b([ABCD])\b", s, flags=re.I)
     if matches is None:
         return ""
-    return matches[0]
+    return matches.group(1).upper()
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [
"The best answer is",
"The correct answer is",
"The answer is",
"The answer",
"The best option is" "The correct option is",
"Best answer:" "Best option:",
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, "")
if len(s.split()) > 10 and not re.search("[ABCD]", s):
return ""
matches = re.search(r"[ABCD]", s)
if matches is None:
return ""
return matches[0]
from typing import Optional
def extract_characters_regex(s: str) -> str:
s = s.strip()
answer_prefixes = [
"The best answer is",
"The correct answer is",
"The answer is",
"The answer",
"The best option is",
"The correct option is",
"Best answer:",
"Best option:",
]
for answer_prefix in answer_prefixes:
s = re.sub(re.escape(answer_prefix), "", s, flags=re.IGNORECASE).strip()
if len(s.split()) > 10 and not re.search(r"\b[ABCD]\b", s, flags=re.I):
return ""
matches = re.search(r"\b([ABCD])\b", s, flags=re.I)
if matches is None:
return ""
return matches.group(1).upper()
🤖 Prompt for AI Agents
In lmms_eval/tasks/lvbench/utils.py around lines 37 to 56, the answer_prefix
list has missing commas causing string concatenation and the character
extraction is brittle; fix by inserting the missing commas between list
literals, perform prefix removal and then normalize s (lower/upper consistently)
and strip surrounding punctuation, use a case-insensitive regex (e.g.,
re.search(r"\b[ABCD]\b", s, flags=re.IGNORECASE) or re.search(r"[ABCD]", s,
flags=re.IGNORECASE)) to find the first A/B/C/D, and return the uppercase
character (or "" if none); also keep the existing short-word-length guard but
apply it after normalization.



def lvbench_process_results(doc, results):
"""
Args:
doc: a instance of the eval dataset
results: [pred]
Returns:
a dictionary with key: metric name (in this case videomme score), value: metric value
"""
pred = results[0]
pred_ans = extract_characters_regex(pred)
# gt_ans = doc["answer"].lower().strip().replace(".", "")
gt_ans = doc["answer"]
score = pred_ans == gt_ans

# return {f"videomme_perception_score": data_dict for metric in matrices}
return {f"lvbench_score": score}