Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,4 @@ span.log
uv.lock
workspace/*
.claude/*
remote_code/*
2 changes: 1 addition & 1 deletion lmms_eval/loggers/evaluation_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def extract_model_name(model_args: str, key: str) -> str:
return args_after_key.split(",")[0]

# order does matter, e.g. peft and delta are provided together with pretrained
prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine=", "model_version=", "model_name=", "model_id="]
for prefix in prefixes:
if prefix in model_args:
return extract_model_name(model_args, prefix)
Expand Down
1 change: 1 addition & 0 deletions lmms_eval/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
"sglang": "Sglang",
"huggingface": "Huggingface",
"async_openai": "AsyncOpenAIChat",
"longvila": "LongVila",
}


Expand Down
8 changes: 7 additions & 1 deletion lmms_eval/models/chat/async_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,17 @@ async def maybe_forward_with_tool(self, request: Instance, idx: int):
for call in message.tool_calls:
eval_logger.debug(f"Calling {call.function.name}...")
result = await self.mcp_client.run_tool(call.function.name, eval(call.function.arguments))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Blocker: Replace eval(...) of tool arguments with safe JSON parsing

Evaluating LLM-provided strings is an RCE vector. Parse as JSON and fall back safely.

-                    result = await self.mcp_client.run_tool(call.function.name, eval(call.function.arguments))
+                    raw_args = call.function.arguments
+                    if isinstance(raw_args, str):
+                        try:
+                            args = json.loads(raw_args)
+                        except Exception as e:
+                            eval_logger.warning(f"Failed to parse tool args for {call.function.name}: {e}")
+                            args = {}
+                    elif raw_args is None:
+                        args = {}
+                    else:
+                        args = raw_args
+                    result = await self.mcp_client.run_tool(call.function.name, args)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
result = await self.mcp_client.run_tool(call.function.name, eval(call.function.arguments))
raw_args = call.function.arguments
if isinstance(raw_args, str):
try:
args = json.loads(raw_args)
except Exception as e:
eval_logger.warning(f"Failed to parse tool args for {call.function.name}: {e}")
args = {}
elif raw_args is None:
args = {}
else:
args = raw_args
result = await self.mcp_client.run_tool(call.function.name, args)
🧰 Tools
🪛 Ruff (0.12.2)

186-186: Use of possibly insecure function; consider using ast.literal_eval

(S307)

🤖 Prompt for AI Agents
In lmms_eval/models/chat/async_openai.py around line 186, replace the unsafe
eval(call.function.arguments) with safe JSON parsing: import json at the top,
then attempt to parse call.function.arguments via json.loads in a try/except
(catch json.JSONDecodeError, TypeError, ValueError) and on success pass the
parsed object to run_tool; on failure log/warn about the parse error and fall
back to a safe default (e.g., {} or the raw string wrapped appropriately) so no
arbitrary code is executed.

all_response += f"<tool_call>{call.function.name} {call.function.arguments}</tool_call>"
all_response += f"<tool_call>{call.function.name} {call.function.arguments}</tool_call></tool_response>"
tool_messages.append({"role": "tool", "name": call.function.name, "content": []})
for content in result.content:
tool_message = self.mcp_client.convert_result_to_openai_format(content)
for content in tool_message:
if content["type"] == "image_url":
all_response += "<image_url>"
elif content["type"] == "text":
all_response += content["text"]
tool_messages[-1]["content"].extend(tool_message)
all_response += "</tool_response>"
Comment on lines +187 to +197
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Fix malformed tool_response markup and include tool_call_id on tool messages

  • The first line appends a stray closing </tool_response> without an opener (and you close again later).
  • Add tool_call_id to tool messages to align with OpenAI Chat Completions tool schema.
  • Minor: avoid reusing the name content inside the inner loop; add audio_url handling.
-                    all_response += f"<tool_call>{call.function.name} {call.function.arguments}</tool_call></tool_response>"
-                    tool_messages.append({"role": "tool", "name": call.function.name, "content": []})
+                    all_response += f"<tool_call>{call.function.name} {call.function.arguments}</tool_call><tool_response>"
+                    tool_messages.append(
+                        {
+                            "role": "tool",
+                            "tool_call_id": call.id,
+                            "name": call.function.name,
+                            "content": [],
+                        }
+                    )
                     for content in result.content:
                         tool_message = self.mcp_client.convert_result_to_openai_format(content)
-                        for content in tool_message:
-                            if content["type"] == "image_url":
+                        for item in tool_message:
+                            if item["type"] == "image_url":
                                 all_response += "<image_url>"
-                            elif content["type"] == "text":
-                                all_response += content["text"]
-                        tool_messages[-1]["content"].extend(tool_message)
+                            elif item["type"] == "audio_url":
+                                all_response += "<audio_url>"
+                            elif item["type"] == "text":
+                                all_response += item["text"]
+                        tool_messages[-1]["content"].extend(tool_message)
                     all_response += "</tool_response>"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
all_response += f"<tool_call>{call.function.name} {call.function.arguments}</tool_call></tool_response>"
tool_messages.append({"role": "tool", "name": call.function.name, "content": []})
for content in result.content:
tool_message = self.mcp_client.convert_result_to_openai_format(content)
for content in tool_message:
if content["type"] == "image_url":
all_response += "<image_url>"
elif content["type"] == "text":
all_response += content["text"]
tool_messages[-1]["content"].extend(tool_message)
all_response += "</tool_response>"
all_response += f"<tool_call>{call.function.name} {call.function.arguments}</tool_call><tool_response>"
tool_messages.append(
{
"role": "tool",
"tool_call_id": call.id,
"name": call.function.name,
"content": [],
}
)
for content in result.content:
tool_message = self.mcp_client.convert_result_to_openai_format(content)
for item in tool_message:
if item["type"] == "image_url":
all_response += "<image_url>"
elif item["type"] == "audio_url":
all_response += "<audio_url>"
elif item["type"] == "text":
all_response += item["text"]
tool_messages[-1]["content"].extend(tool_message)
all_response += "</tool_response>"
🤖 Prompt for AI Agents
In lmms_eval/models/chat/async_openai.py around lines 187 to 197, the code emits
a stray closing </tool_response> before the opening tag, doesn't include
tool_call_id on tool messages, reuses the variable name "content" in nested
loops, and lacks audio_url handling; fix by removing the premature
</tool_response>, prepend a proper <tool_response> (including tool_call id and
call.function.name/arguments) before writing the tool call tag, add
"tool_call_id": call.id to the tool_messages entries, rename the inner loop
variable to avoid shadowing (e.g., result_item), add handling for content types
"image_url" and "audio_url" (append appropriate markers and include them in
tool_messages[-1]["content"]), and ensure the final closing </tool_response>
remains only at the end.


response = await self.client.chat.completions.create(
model=self.model_version,
Expand Down
184 changes: 184 additions & 0 deletions lmms_eval/models/chat/longvila.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional, Tuple, Union

from tqdm import tqdm
from transformers import AutoModel

from lmms_eval.api.instance import Instance
from lmms_eval.api.registry import register_model
from lmms_eval.models.model_utils.gen_metrics import log_metrics
from lmms_eval.models.simple.vllm import VLLM as VLLMSimple
from lmms_eval.protocol import ChatMessages

try:
from vllm import LLM, SamplingParams
except ImportError:
vllm = None

Comment on lines +16 to +20
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

vLLM import fallback can crash later; fail fast with actionable error

If vllm isn’t installed, SamplingParams is undefined but later used. Guard and raise with guidance.

-try:
-    from vllm import LLM, SamplingParams
-except ImportError:
-    vllm = None
+try:
+    from vllm import LLM, SamplingParams
+except ImportError as _vllm_err:
+    LLM = None  # type: ignore[assignment]
+    SamplingParams = None  # type: ignore[assignment]
+    _VLLM_IMPORT_ERROR = _vllm_err

And in __init__:

     def __init__(
@@
     ):
+        if LLM is None or SamplingParams is None:
+            raise ImportError(
+                "vLLM is required for LongVila. Install with `pip install vllm`."
+            ) from _VLLM_IMPORT_ERROR
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
try:
from vllm import LLM, SamplingParams
except ImportError:
vllm = None
try:
from vllm import LLM, SamplingParams
except ImportError as _vllm_err:
LLM = None # type: ignore[assignment]
SamplingParams = None # type: ignore[assignment]
_VLLM_IMPORT_ERROR = _vllm_err
🤖 Prompt for AI Agents
In lmms_eval/models/chat/longvila.py around lines 16-20, the vllm import
fallback leaves SamplingParams undefined which will cause a crash later; modify
the import block to fail fast by raising an ImportError (or RuntimeError) with
an actionable message instructing to install vllm (and optionally provide pip
install instructions and mention SamplingParams/LLM are required), and update
the class __init__ to check that vllm and SamplingParams are available before
using them, raising the same clear error if they are missing so callers get
immediate, actionable feedback instead of a later NameError.

WORKERS = int(os.getenv("WORKERS", "32"))


@register_model("longvila")
class LongVila(VLLMSimple):
is_simple = False

def __init__(
self,
model="Efficient-Large-Model/LongVILA-R1-7B",
tensor_parallel_size=1,
data_parallel_size=1,
gpu_memory_utilization=0.5,
batch_size=1,
max_frame_num=32,
trust_remote_code=True,
chat_template=None,
max_pixels: int = 1605632,
min_image_pixels=28,
fps: Optional[int] = None,
device_map: Optional[str] = "cuda",
**kwargs,
):
# vLLM requires the path to the autoregressive llm weights under the model root
model_root = model
llm_path = os.path.join(model_root, "llm")
# Enable prompt embeddings so we can pass encoder-produced embeddings directly
Comment on lines +28 to +47
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Validate model paths and pass-through trust flag

Check that llm_path exists to fail early with a clear message; also forward trust_remote_code consistently.

         model_root = model
         llm_path = os.path.join(model_root, "llm")
+        if not os.path.isdir(llm_path):
+            raise FileNotFoundError(
+                f"Expected autoregressive LLM under '{llm_path}'. "
+                "Verify your LongVILA model layout."
+            )
@@
-        kwargs["enable_prompt_embeds"] = True
+        kwargs["enable_prompt_embeds"] = True
@@
-        super().__init__(llm_path, tensor_parallel_size, data_parallel_size, gpu_memory_utilization, batch_size, max_frame_num, trust_remote_code, chat_template, min_image_pixels, **kwargs)
+        super().__init__(
+            llm_path,
+            tensor_parallel_size,
+            data_parallel_size,
+            gpu_memory_utilization,
+            batch_size,
+            max_frame_num,
+            trust_remote_code,
+            chat_template,
+            min_image_pixels,
+            **kwargs,
+        )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def __init__(
self,
model="Efficient-Large-Model/LongVILA-R1-7B",
tensor_parallel_size=1,
data_parallel_size=1,
gpu_memory_utilization=0.5,
batch_size=1,
max_frame_num=32,
trust_remote_code=True,
chat_template=None,
max_pixels: int = 1605632,
min_image_pixels=28,
fps: Optional[int] = None,
device_map: Optional[str] = "cuda",
**kwargs,
):
# vLLM requires the path to the autoregressive llm weights under the model root
model_root = model
llm_path = os.path.join(model_root, "llm")
# Enable prompt embeddings so we can pass encoder-produced embeddings directly
def __init__(
self,
model="Efficient-Large-Model/LongVILA-R1-7B",
tensor_parallel_size=1,
data_parallel_size=1,
gpu_memory_utilization=0.5,
batch_size=1,
max_frame_num=32,
trust_remote_code=True,
chat_template=None,
max_pixels: int = 1605632,
min_image_pixels=28,
fps: Optional[int] = None,
device_map: Optional[str] = "cuda",
**kwargs,
):
# vLLM requires the path to the autoregressive llm weights under the model root
model_root = model
llm_path = os.path.join(model_root, "llm")
if not os.path.isdir(llm_path):
raise FileNotFoundError(
f"Expected autoregressive LLM under '{llm_path}'. "
"Verify your LongVILA model layout."
)
# Enable prompt embeddings so we can pass encoder-produced embeddings directly
kwargs["enable_prompt_embeds"] = True
super().__init__(
llm_path,
tensor_parallel_size,
data_parallel_size,
gpu_memory_utilization,
batch_size,
max_frame_num,
trust_remote_code,
chat_template,
min_image_pixels,
**kwargs,
)
🤖 Prompt for AI Agents
In lmms_eval/models/chat/longvila.py around lines 28 to 47, add an explicit
existence check for the computed llm_path and raise a clear FileNotFoundError if
it doesn't exist so the class fails fast with a helpful message; also ensure the
incoming trust_remote_code flag is stored on self (e.g., self.trust_remote_code
= trust_remote_code) and passed through to any downstream model-loading calls
that accept trust_remote_code so the same behavior is consistently applied.

kwargs["enable_prompt_embeds"] = True
self.fps = fps
self.max_pixels = max_pixels

# Set up imports from the model's remote_code directory
# The LongVILA repo provides preprocessing utilities we must call directly
try:
from remote_code.media import extract_media as _extract_media
from remote_code.mm_utils import process_images as _process_images
from remote_code.tokenizer_utils import (
tokenize_conversation as _tokenize_conversation,
)
except Exception as e:
raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}")

self.extract_media = _extract_media
self.process_images = _process_images
self.tokenize_conversation = _tokenize_conversation

Comment on lines +52 to +66
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

remote_code import path will fail unless it’s on sys.path

The code imports remote_code.* without adding model_root to sys.path. Load explicitly from the model folder to avoid relying on a repo-level remote_code/.

-        try:
-            from remote_code.media import extract_media as _extract_media
-            from remote_code.mm_utils import process_images as _process_images
-            from remote_code.tokenizer_utils import (
-                tokenize_conversation as _tokenize_conversation,
-            )
-        except Exception as e:
-            raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}")
+        try:
+            remote_code_dir = os.path.join(model_root)
+            if remote_code_dir not in sys.path:
+                sys.path.insert(0, remote_code_dir)
+            from remote_code.media import extract_media as _extract_media  # type: ignore
+            from remote_code.mm_utils import process_images as _process_images  # type: ignore
+            from remote_code.tokenizer_utils import (  # type: ignore
+                tokenize_conversation as _tokenize_conversation,
+            )
+        except Exception as err:
+            raise ImportError(
+                f"Failed to import LongVILA remote_code from '{model_root}'. "
+                "Ensure the model path contains a 'remote_code/' package."
+            ) from err
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Set up imports from the model's remote_code directory
# The LongVILA repo provides preprocessing utilities we must call directly
try:
from remote_code.media import extract_media as _extract_media
from remote_code.mm_utils import process_images as _process_images
from remote_code.tokenizer_utils import (
tokenize_conversation as _tokenize_conversation,
)
except Exception as e:
raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}")
self.extract_media = _extract_media
self.process_images = _process_images
self.tokenize_conversation = _tokenize_conversation
# Set up imports from the model's remote_code directory
# The LongVILA repo provides preprocessing utilities we must call directly
try:
remote_code_dir = os.path.join(model_root)
if remote_code_dir not in sys.path:
sys.path.insert(0, remote_code_dir)
from remote_code.media import extract_media as _extract_media # type: ignore
from remote_code.mm_utils import process_images as _process_images # type: ignore
from remote_code.tokenizer_utils import ( # type: ignore
tokenize_conversation as _tokenize_conversation,
)
except Exception as err:
raise ImportError(
f"Failed to import LongVILA remote_code from '{model_root}'. "
"Ensure the model path contains a 'remote_code/' package."
) from err
self.extract_media = _extract_media
self.process_images = _process_images
self.tokenize_conversation = _tokenize_conversation
🧰 Tools
🪛 Ruff (0.12.2)

60-60: Do not catch blind exception: Exception

(BLE001)


61-61: Within an except clause, raise exceptions with raise ... from err or raise ... from None to distinguish them from errors in exception handling

(B904)


61-61: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
In lmms_eval/models/chat/longvila.py around lines 52 to 66, the imports from
remote_code will fail unless model_root is on sys.path; prepend the absolute
model_root to sys.path before the try block (and optionally pop it afterward) so
Python can import remote_code from the model folder, ensure model_root exists
and is resolved with os.path.abspath, and keep the existing try/except and
attribute assignments intact.

# Load the encoder that produces prompt embeddings for the LLM
# llm_only_need_embed reduces memory usage to only what's needed for embedding
self.model_encoder = AutoModel.from_pretrained(
model_root,
trust_remote_code=True,
device_map=device_map,
llm_only_need_embed=True,
)
super().__init__(llm_path, tensor_parallel_size, data_parallel_size, gpu_memory_utilization, batch_size, max_frame_num, trust_remote_code, chat_template, min_image_pixels, **kwargs)

def _to_remote_conversation(self, chat_messages: ChatMessages) -> list:
"""
Convert ChatMessages to LongVILA remote_code conversation format.
[{"from": "human"|"gpt", "value": [str | {"path": media_path}, ...]}, ...]
"""
role_map = {"user": "human", "assistant": "gpt", "system": "human"}
conversation = []
for msg in chat_messages.messages:
from_role = role_map.get(msg.role, "human")
value_parts = []
for content in msg.content:
# ChatTextContent
if getattr(content, "type", None) == "text":
value_parts.append(content.text)
# Images, Videos, Audios -> use path dicts as required by tokenizer_utils
elif getattr(content, "type", None) in ("image", "video", "audio"):
value_parts.append({"path": content.url})
if value_parts:
conversation.append({"from": from_role, "value": value_parts})
return conversation

def make_one_request(self, request: Instance) -> Tuple["object", dict]:
"""
Build prompt embeddings and per-request sampling params from an Instance.
Returns (inputs_embeds, params_dict). Does not mutate input.
"""
ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
chat_messages = ChatMessages(messages=raw_messages)

# Copy to avoid side-effects across threads
_gen = dict(gen_kwargs or {})
_gen.setdefault("max_new_tokens", 4096)
_gen.setdefault("temperature", 0)
_gen.setdefault("top_p", 0.95)

params = {
"temperature": _gen["temperature"],
"max_tokens": _gen["max_new_tokens"],
"top_p": _gen["top_p"],
}

# Convert to LongVILA remote_code conversation format
conversation = self._to_remote_conversation(chat_messages)

# Extract and preprocess media
if self.fps:
self.model_encoder.config.fps = self.fps
else:
self.model_encoder.config.num_video_frames = self.max_frame_num
self.model_encoder.config.fps = 0
media = self.extract_media(conversation, self.model_encoder.config)
if "video" in media and media["video"] is not None:
media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]

# Tokenize conversation and move to CUDA for embedding
input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()

# Create prompt embeddings using the model encoder
inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)

return inputs_embeds, params

Comment on lines +98 to +139
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Device placement: avoid unconditional .cuda()

Calling .cuda() breaks on CPU-only setups or non-CUDA devices. Derive device from the encoder.

-        input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
+        input_ids = (
+            self.tokenize_conversation(
+                conversation,
+                self.model_encoder.tokenizer,
+                add_generation_prompt=True,
+            )
+            .unsqueeze(0)
+            .to(next(self.model_encoder.parameters()).device)
+        )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def make_one_request(self, request: Instance) -> Tuple["object", dict]:
"""
Build prompt embeddings and per-request sampling params from an Instance.
Returns (inputs_embeds, params_dict). Does not mutate input.
"""
ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
chat_messages = ChatMessages(messages=raw_messages)
# Copy to avoid side-effects across threads
_gen = dict(gen_kwargs or {})
_gen.setdefault("max_new_tokens", 4096)
_gen.setdefault("temperature", 0)
_gen.setdefault("top_p", 0.95)
params = {
"temperature": _gen["temperature"],
"max_tokens": _gen["max_new_tokens"],
"top_p": _gen["top_p"],
}
# Convert to LongVILA remote_code conversation format
conversation = self._to_remote_conversation(chat_messages)
# Extract and preprocess media
if self.fps:
self.model_encoder.config.fps = self.fps
else:
self.model_encoder.config.num_video_frames = self.max_frame_num
self.model_encoder.config.fps = 0
media = self.extract_media(conversation, self.model_encoder.config)
if "video" in media and media["video"] is not None:
media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]
# Tokenize conversation and move to CUDA for embedding
input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
# Create prompt embeddings using the model encoder
inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)
return inputs_embeds, params
def make_one_request(self, request: Instance) -> Tuple["object", dict]:
"""
Build prompt embeddings and per-request sampling params from an Instance.
Returns (inputs_embeds, params_dict). Does not mutate input.
"""
ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
chat_messages = ChatMessages(messages=raw_messages)
# Copy to avoid side-effects across threads
_gen = dict(gen_kwargs or {})
_gen.setdefault("max_new_tokens", 4096)
_gen.setdefault("temperature", 0)
_gen.setdefault("top_p", 0.95)
params = {
"temperature": _gen["temperature"],
"max_tokens": _gen["max_new_tokens"],
"top_p": _gen["top_p"],
}
# Convert to LongVILA remote_code conversation format
conversation = self._to_remote_conversation(chat_messages)
# Extract and preprocess media
if self.fps:
self.model_encoder.config.fps = self.fps
else:
self.model_encoder.config.num_video_frames = self.max_frame_num
self.model_encoder.config.fps = 0
media = self.extract_media(conversation, self.model_encoder.config)
if "video" in media and media["video"] is not None:
media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]
# Tokenize conversation and move to CUDA for embedding
input_ids = (
self.tokenize_conversation(
conversation,
self.model_encoder.tokenizer,
add_generation_prompt=True,
)
.unsqueeze(0)
.to(next(self.model_encoder.parameters()).device)
)
# Create prompt embeddings using the model encoder
inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)
return inputs_embeds, params

def generate_until(self, requests) -> List[str]:
res = []
self.load_cache()
res, requests = self.get_response_from_cache(requests)
pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

batch_size = self.batch_size_per_gpu
batched_requests = [requests[i : i + batch_size] for i in range(0, len(requests), batch_size)]
e2e_latency = 0
for batch_requests in batched_requests:
prompt_embeds_list = []
params_list = []
# Build embeddings sequentially to avoid GPU contention in the encoder
for req in tqdm(batch_requests, disable=(self.rank != 0), desc="Building embeddings"):
inputs_embeds, params = self.make_one_request(req)
prompt_embeds_list.append({"prompt_embeds": inputs_embeds.squeeze(0)})
params_list.append(params)

# For now, assume homogeneous sampling params within a batch
sampling_params = SamplingParams(**params_list[-1])

start_time = time.time()
response = self.client.generate(prompts=prompt_embeds_list, sampling_params=sampling_params)
end_time = time.time()

response_text = [o.outputs[0].text for o in response]
for req, text in zip(batch_requests, response_text):
self.add_request_response_to_cache(req, text)

# Calculate timing metrics for batch
e2e_latency += end_time - start_time

assert len(response_text) == len(batch_requests)
res.extend(response_text)
pbar.update(len(batch_requests))

pbar.close()
return res

def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
# TODO
assert False, "GPT4V not support"

def generate_until_multi_round(self, requests) -> List[str]:
raise NotImplementedError("TODO: Implement multi-round generation")
Comment on lines +179 to +184
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Replace assert False and remove unused arg warning

Use a proper exception and underscore the unused parameter.

-    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
-        # TODO
-        assert False, "GPT4V not support"
+    def loglikelihood(self, _requests: List[Instance]) -> List[Tuple[float, bool]]:  # noqa: ARG002
+        raise NotImplementedError("LongVila does not support loglikelihood.")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
# TODO
assert False, "GPT4V not support"
def generate_until_multi_round(self, requests) -> List[str]:
raise NotImplementedError("TODO: Implement multi-round generation")
def loglikelihood(self, _requests: List[Instance]) -> List[Tuple[float, bool]]: # noqa: ARG002
raise NotImplementedError("LongVila does not support loglikelihood.")
def generate_until_multi_round(self, requests) -> List[str]:
raise NotImplementedError("TODO: Implement multi-round generation")
🧰 Tools
🪛 Ruff (0.12.2)

179-179: Unused method argument: requests

(ARG002)


181-181: Do not assert False (python -O removes these calls), raise AssertionError()

Replace assert False

(B011)

🤖 Prompt for AI Agents
In lmms_eval/models/chat/longvila.py around lines 179 to 184, replace the assert
False with a proper exception and silence the unused-parameter warning by
renaming the argument to _requests: implement loglikelihood to raise
NotImplementedError("GPT4V not supported") instead of asserting, and change
generate_until_multi_round(self, requests) to generate_until_multi_round(self,
_requests) (or similarly prefix the parameter with an underscore) and keep its
NotImplementedError; this removes the assert and the unused-arg warning while
clearly signaling unimplemented functionality.

12 changes: 11 additions & 1 deletion lmms_eval/tasks/charades_sta/eval_tvg.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def extract_time(paragraph):

timestamps = []
# Check for The given query happens in m - n (seconds)
patterns = [r"(\d+\.*\d*)\s*-\s*(\d+\.*\d*)"]
patterns = [r"(\d+\.*\d*)\s*[–-]\s*(\d+\.*\d*)"]

for time_pattern in patterns:
time_matches = re.findall(time_pattern, paragraph)
Expand Down Expand Up @@ -84,6 +84,16 @@ def extract_time(paragraph):
times.append(time_in_sec)
times = times[: len(times) // 2 * 2]
timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]
# Fallback: if no timestamps found, search for any two number patterns with dash
if len(timestamps) == 0:
# More comprehensive pattern to match various formats like:
# xx - xx, x.xx s - x.xx s, x.xxs - x.xxs, etc.
# Also handle en dash (–) and regular dash (-)
fallback_pattern = r"(\d+(?:\.\d+)?)\s*s?\s*[–-]\s*(\d+(?:\.\d+)?)\s*s?"
fallback_matches = re.findall(fallback_pattern, paragraph)
if fallback_matches:
timestamps = [[float(start), float(end)] for start, end in fallback_matches]

Comment on lines +87 to +96
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Fallback regex: remove literal en dash; reduce false positives; handle units.

  • Replace the literal EN DASH with \u2013 to avoid Ruff RUF001/RUF003.
  • Limit search scope to keyword-filtered candidates to avoid catching unrelated numeric ranges in the whole paragraph.
  • Optionally accept "s/sec/seconds".

Apply this diff:

-    if len(timestamps) == 0:
-        # More comprehensive pattern to match various formats like:
-        # xx - xx, x.xx s - x.xx s, x.xxs - x.xxs, etc.
-        # Also handle en dash (–) and regular dash (-)
-        fallback_pattern = r"(\d+(?:\.\d+)?)\s*s?\s*[–-]\s*(\d+(?:\.\d+)?)\s*s?"
-        fallback_matches = re.findall(fallback_pattern, paragraph)
+    if len(timestamps) == 0:
+        # More comprehensive pattern to match various formats like:
+        # xx - xx, x.xx s - x.xx s, x.xxs - x.xxs, etc.
+        # Also handle hyphen-minus (-) and EN DASH (U+2013)
+        fallback_pattern = (
+            r"(\d+(?:\.\d+)?)\s*(?:s|sec|secs|second|seconds)?\s*"
+            r"[\u2013-]\s*"
+            r"(\d+(?:\.\d+)?)\s*(?:s|sec|secs|second|seconds)?\b"
+        )
+        scope = ' '.join(candidates) if candidates else paragraph
+        fallback_matches = re.findall(fallback_pattern, scope)

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Ruff (0.12.2)

91-91: Comment contains ambiguous (EN DASH). Did you mean - (HYPHEN-MINUS)?

(RUF003)


92-92: String contains ambiguous (EN DASH). Did you mean - (HYPHEN-MINUS)?

(RUF001)

results = []
for start, end in timestamps:
if end > start:
Expand Down
28 changes: 28 additions & 0 deletions lmms_eval/tasks/lvbench/lvbench.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
dataset_path: lmms-lab/LVBench
dataset_kwargs:
token: True
cache_dir: lvbench
video: True
# From_YouTube: True
test_split: train
task: lvbench
output_type: generate_until
doc_to_visual: !function utils.lvbench_doc_to_visual
doc_to_text: !function utils.lvbench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
max_new_tokens: 16
# The return value of process_results will be used by metrics
process_results: !function utils.lvbench_process_results
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: lvbench_score
aggregation: mean
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer the question with the option letter"
metadata:
- version: 0.0

74 changes: 74 additions & 0 deletions lmms_eval/tasks/lvbench/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os
import re
from pathlib import Path

import yaml

hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
base_cache_dir = os.path.expanduser(hf_home)
with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
# remove function definition since yaml load cannot handle it
if "!function" not in line:
safe_data.append(line)
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
Comment on lines +7 to +16
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Harden YAML parsing and fix unused loop var

  • Rename unused i to _i (Ruff B007).
  • Add defensive checks around YAML keys to avoid KeyError if the schema changes.

Apply:

-raw_data = f.readlines()
-safe_data = []
-for i, line in enumerate(raw_data):
+raw_data = f.readlines()
+safe_data = []
+for _i, line in enumerate(raw_data):
     # remove function definition since yaml load cannot handle it
     if "!function" not in line:
         safe_data.append(line)
-cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+data = yaml.safe_load("".join(safe_data)) or {}
+try:
+    cache_name = data["dataset_kwargs"]["cache_dir"]
+except KeyError as exc:
+    raise KeyError("Expected 'dataset_kwargs.cache_dir' in lvbench.yaml") from exc
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
base_cache_dir = os.path.expanduser(hf_home)
with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
# remove function definition since yaml load cannot handle it
if "!function" not in line:
safe_data.append(line)
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
base_cache_dir = os.path.expanduser(hf_home)
with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for _i, line in enumerate(raw_data):
# remove function definition since yaml load cannot handle it
if "!function" not in line:
safe_data.append(line)
data = yaml.safe_load("".join(safe_data)) or {}
try:
cache_name = data["dataset_kwargs"]["cache_dir"]
except KeyError as exc:
raise KeyError("Expected 'dataset_kwargs.cache_dir' in lvbench.yaml") from exc
🧰 Tools
🪛 Ruff (0.12.2)

12-12: Loop control variable i not used within loop body

Rename unused i to _i

(B007)

🤖 Prompt for AI Agents
In lmms_eval/tasks/lvbench/utils.py around lines 7 to 16, the loop uses an
unused variable `i` and the YAML access assumes keys exist which can raise
KeyError; rename `i` to `_i` to satisfy Ruff B007 and add defensive checks when
loading the YAML (e.g., check that the top-level mapping exists and contains
"dataset_kwargs" and "cache_dir" keys, raising a clear error or falling back to
a default) before assigning cache_name so the code won't crash if the schema
changes.



def lvbench_doc_to_visual(doc):
cache_dir = os.path.join(base_cache_dir, cache_name)
video_path = doc["video_path"]
assert os.path.exists(os.path.join(cache_dir, video_path))
video_path = os.path.join(cache_dir, video_path)
return [video_path]
Comment on lines +19 to +24
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Avoid asserts for I/O; add types and clearer errors

Use Path checks and raise FileNotFoundError. Add type hints.

-def lvbench_doc_to_visual(doc):
-    cache_dir = os.path.join(base_cache_dir, cache_name)
-    video_path = doc["video_path"]
-    assert os.path.exists(os.path.join(cache_dir, video_path))
-    video_path = os.path.join(cache_dir, video_path)
-    return [video_path]
+from typing import Dict, List
+
+def lvbench_doc_to_visual(doc: Dict) -> List[str]:
+    cache_dir = Path(base_cache_dir) / cache_name
+    video_path = Path(doc["video_path"])
+    abs_path = video_path if video_path.is_absolute() else (cache_dir / video_path)
+    if not abs_path.exists():
+        raise FileNotFoundError(f"Video not found: {abs_path}")
+    return [str(abs_path)]

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In lmms_eval/tasks/lvbench/utils.py around lines 19 to 24, the function uses an
assert for an I/O check and lacks type hints; replace the assert with
pathlib.Path checks and raise FileNotFoundError with a clear message if the file
is missing, add type hints (e.g., doc: Mapping[str, Any] -> List[Path]) on the
function signature, construct video_path as a Path using
base_cache_dir/cache_name and doc["video_path"], and return a list of Path
objects; ensure pathlib.Path is imported and adjust any callers if they expect
strings.



def lvbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
if lmms_eval_specific_kwargs is None:
lmms_eval_specific_kwargs = {}
if "pre_prompt" not in lmms_eval_specific_kwargs:
lmms_eval_specific_kwargs["pre_prompt"] = ""
if "post_prompt" not in lmms_eval_specific_kwargs:
lmms_eval_specific_kwargs["post_prompt"] = "\nAnswer the question with the option letter"
return lmms_eval_specific_kwargs["pre_prompt"] + doc["question"] + lmms_eval_specific_kwargs["post_prompt"]


def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [
"The best answer is",
"The correct answer is",
"The answer is",
"The answer",
"The best option is" "The correct option is",
"Best answer:" "Best option:",
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, "")

if len(s.split()) > 10 and not re.search("[ABCD]", s):
return ""

matches = re.search(r"[ABCD]", s)
if matches is None:
return ""
return matches[0]
Comment on lines +37 to +56
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Fix concatenated strings bug and improve extraction robustness

Missing commas are concatenating literals (e.g., "Best answer:" "Best option:"). Also prefer case-insensitive matching and strip punctuation.

-def extract_characters_regex(s):
+from typing import Optional
+def extract_characters_regex(s: str) -> str:
     s = s.strip()
     answer_prefixes = [
         "The best answer is",
         "The correct answer is",
         "The answer is",
         "The answer",
-        "The best option is" "The correct option is",
-        "Best answer:" "Best option:",
+        "The best option is",
+        "The correct option is",
+        "Best answer:",
+        "Best option:",
     ]
     for answer_prefix in answer_prefixes:
-        s = s.replace(answer_prefix, "")
+        s = re.sub(re.escape(answer_prefix), "", s, flags=re.IGNORECASE).strip()
 
-    if len(s.split()) > 10 and not re.search("[ABCD]", s):
+    if len(s.split()) > 10 and not re.search(r"\b[ABCD]\b", s, flags=re.I):
         return ""
 
-    matches = re.search(r"[ABCD]", s)
+    matches = re.search(r"\b([ABCD])\b", s, flags=re.I)
     if matches is None:
         return ""
-    return matches[0]
+    return matches.group(1).upper()
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [
"The best answer is",
"The correct answer is",
"The answer is",
"The answer",
"The best option is" "The correct option is",
"Best answer:" "Best option:",
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, "")
if len(s.split()) > 10 and not re.search("[ABCD]", s):
return ""
matches = re.search(r"[ABCD]", s)
if matches is None:
return ""
return matches[0]
from typing import Optional
def extract_characters_regex(s: str) -> str:
s = s.strip()
answer_prefixes = [
"The best answer is",
"The correct answer is",
"The answer is",
"The answer",
"The best option is",
"The correct option is",
"Best answer:",
"Best option:",
]
for answer_prefix in answer_prefixes:
s = re.sub(re.escape(answer_prefix), "", s, flags=re.IGNORECASE).strip()
if len(s.split()) > 10 and not re.search(r"\b[ABCD]\b", s, flags=re.I):
return ""
matches = re.search(r"\b([ABCD])\b", s, flags=re.I)
if matches is None:
return ""
return matches.group(1).upper()
🤖 Prompt for AI Agents
In lmms_eval/tasks/lvbench/utils.py around lines 37 to 56, the answer_prefix
list has missing commas causing string concatenation and the character
extraction is brittle; fix by inserting the missing commas between list
literals, perform prefix removal and then normalize s (lower/upper consistently)
and strip surrounding punctuation, use a case-insensitive regex (e.g.,
re.search(r"\b[ABCD]\b", s, flags=re.IGNORECASE) or re.search(r"[ABCD]", s,
flags=re.IGNORECASE)) to find the first A/B/C/D, and return the uppercase
character (or "" if none); also keep the existing short-word-length guard but
apply it after normalization.



def lvbench_process_results(doc, results):
"""
Args:
doc: a instance of the eval dataset
results: [pred]
Returns:
a dictionary with key: metric name (in this case videomme score), value: metric value
"""
pred = results[0]
pred_ans = extract_characters_regex(pred)
# gt_ans = doc["answer"].lower().strip().replace(".", "")
gt_ans = doc["answer"]
score = pred_ans == gt_ans

# return {f"videomme_perception_score": data_dict for metric in matrices}
return {f"lvbench_score": score}