diff --git a/.gitignore b/.gitignore
index 997f8f228..99e488d9e 100755
--- a/.gitignore
+++ b/.gitignore
@@ -50,3 +50,4 @@ span.log
uv.lock
workspace/*
.claude/*
+remote_code/*
diff --git a/lmms_eval/loggers/evaluation_tracker.py b/lmms_eval/loggers/evaluation_tracker.py
index 115feb2d3..65936cdf9 100644
--- a/lmms_eval/loggers/evaluation_tracker.py
+++ b/lmms_eval/loggers/evaluation_tracker.py
@@ -68,7 +68,7 @@ def extract_model_name(model_args: str, key: str) -> str:
return args_after_key.split(",")[0]
# order does matter, e.g. peft and delta are provided together with pretrained
- prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
+ prefixes = ["peft=", "delta=", "pretrained=", "model=", "model_version=", "model_name=", "model_id=", "path=", "engine="]
for prefix in prefixes:
if prefix in model_args:
return extract_model_name(model_args, prefix)
diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py
index 0638c56ee..ad45a0d9a 100644
--- a/lmms_eval/models/__init__.py
+++ b/lmms_eval/models/__init__.py
@@ -88,6 +88,7 @@
"sglang": "Sglang",
"huggingface": "Huggingface",
"async_openai": "AsyncOpenAIChat",
+ "longvila": "LongVila",
}
diff --git a/lmms_eval/models/chat/async_openai.py b/lmms_eval/models/chat/async_openai.py
index fb258c21e..b9b0ea788 100644
--- a/lmms_eval/models/chat/async_openai.py
+++ b/lmms_eval/models/chat/async_openai.py
@@ -184,11 +184,17 @@ async def maybe_forward_with_tool(self, request: Instance, idx: int):
for call in message.tool_calls:
eval_logger.debug(f"Calling {call.function.name}...")
result = await self.mcp_client.run_tool(call.function.name, eval(call.function.arguments))
- all_response += f"{call.function.name} {call.function.arguments}"
+ all_response += f"{call.function.name} {call.function.arguments}"
tool_messages.append({"role": "tool", "name": call.function.name, "content": []})
for content in result.content:
tool_message = self.mcp_client.convert_result_to_openai_format(content)
+ for content in tool_message:
+ if content["type"] == "image_url":
+ all_response += ""
+ elif content["type"] == "text":
+ all_response += content["text"]
tool_messages[-1]["content"].extend(tool_message)
+ all_response += ""
response = await self.client.chat.completions.create(
model=self.model_version,
diff --git a/lmms_eval/models/chat/longvila.py b/lmms_eval/models/chat/longvila.py
new file mode 100644
index 000000000..b1c8d77d3
--- /dev/null
+++ b/lmms_eval/models/chat/longvila.py
@@ -0,0 +1,184 @@
+import os
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Optional, Tuple, Union
+
+from tqdm import tqdm
+from transformers import AutoModel
+
+from lmms_eval.api.instance import Instance
+from lmms_eval.api.registry import register_model
+from lmms_eval.models.model_utils.gen_metrics import log_metrics
+from lmms_eval.models.simple.vllm import VLLM as VLLMSimple
+from lmms_eval.protocol import ChatMessages
+
+try:
+    from vllm import LLM, SamplingParams
+except ImportError:
+    vllm = None  # import failure is tolerated here, but LLM/SamplingParams stay undefined, so generate_until cannot run without vllm
+
+WORKERS = int(os.getenv("WORKERS", "32"))  # taken from the WORKERS env var (default 32); not referenced elsewhere in this file
+
+
+@register_model("longvila")
+class LongVila(VLLMSimple):
+ is_simple = False
+
+    def __init__(
+        self,
+        model="Efficient-Large-Model/LongVILA-R1-7B",
+        tensor_parallel_size=1,
+        data_parallel_size=1,
+        gpu_memory_utilization=0.5,
+        batch_size=1,
+        max_frame_num=32,
+        trust_remote_code=True,
+        chat_template=None,
+        max_pixels: int = 1605632,
+        min_image_pixels=28,
+        fps: Optional[int] = None,
+        device_map: Optional[str] = "cuda",
+        **kwargs,
+    ):
+        """Load the LongVILA encoder and start vLLM on the ``llm`` sub-directory of ``model``.
+
+        ``fps`` and ``max_pixels`` are only stored on the instance, ``device_map`` is passed to the
+        encoder's ``from_pretrained``, and the other arguments go to the parent vLLM wrapper.
+        Raises ImportError when the ``remote_code`` helpers cannot be imported.
+        """
+        # vLLM requires the path to the autoregressive llm weights under the model root
+        model_root = model
+        llm_path = os.path.join(model_root, "llm")
+        # Enable prompt embeddings so we can pass encoder-produced embeddings directly
+        kwargs["enable_prompt_embeds"] = True
+        self.fps = fps
+        self.max_pixels = max_pixels
+
+        # The LongVILA repo provides preprocessing utilities we must call directly; they are
+        # imported as the top-level ``remote_code`` package, which must therefore be importable
+        try:
+            from remote_code.media import extract_media as _extract_media
+            from remote_code.mm_utils import process_images as _process_images
+            from remote_code.tokenizer_utils import tokenize_conversation as _tokenize_conversation
+        except Exception as e:
+            # chain the cause so the original traceback stays visible to the user
+            raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}") from e
+
+        self.extract_media = _extract_media
+        self.process_images = _process_images
+        self.tokenize_conversation = _tokenize_conversation
+
+        # Load the encoder that produces prompt embeddings for the LLM
+        # llm_only_need_embed reduces memory usage to only what's needed for embedding
+        self.model_encoder = AutoModel.from_pretrained(model_root, trust_remote_code=True, device_map=device_map, llm_only_need_embed=True)
+        super().__init__(llm_path, tensor_parallel_size, data_parallel_size, gpu_memory_utilization, batch_size, max_frame_num, trust_remote_code, chat_template, min_image_pixels, **kwargs)
+
+    def _to_remote_conversation(self, chat_messages: ChatMessages) -> list:
+        """
+        Translate ChatMessages into the conversation layout used by LongVILA's remote_code:
+        [{"from": "human"|"gpt", "value": [str | {"path": media_path}, ...]}, ...]
+        Only "assistant" maps to "gpt"; every other role is "human". Empty messages are dropped.
+        """
+        media_types = ("image", "video", "audio")
+        turns = []
+        for message in chat_messages.messages:
+            parts = []
+            for item in message.content:
+                kind = getattr(item, "type", None)
+                if kind == "text":
+                    parts.append(item.text)
+                elif kind in media_types:
+                    # media is referenced through path dicts, as required by tokenizer_utils
+                    parts.append({"path": item.url})
+            if parts:
+                turns.append({"from": "gpt" if message.role == "assistant" else "human", "value": parts})
+        return turns
+
+    def make_one_request(self, request: Instance) -> Tuple["object", dict]:
+        """
+        Turn one Instance into prompt embeddings plus its sampling parameters.
+        Returns (inputs_embeds, params_dict); the request's gen_kwargs are left untouched.
+        """
+        ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
+        doc = self.task_dict[task][split][doc_id]
+        chat_messages = ChatMessages(messages=doc_to_messages(doc))
+
+        # Defaults first, then the request's own values, merged into a fresh dict
+        gen_opts = {"max_new_tokens": 4096, "temperature": 0, "top_p": 0.95, **(gen_kwargs or {})}
+        params = {
+            "temperature": gen_opts["temperature"],
+            "max_tokens": gen_opts["max_new_tokens"],
+            "top_p": gen_opts["top_p"],
+        }
+
+        # LongVILA's remote_code works on its own conversation format
+        conversation = self._to_remote_conversation(chat_messages)
+
+        # Frame sampling is configured on the encoder config before media extraction:
+        # a truthy fps is used as-is, otherwise max_frame_num frames are requested with fps=0
+        encoder = self.model_encoder
+        if self.fps:
+            encoder.config.fps = self.fps
+        else:
+            encoder.config.num_video_frames = self.max_frame_num
+            encoder.config.fps = 0
+        media = self.extract_media(conversation, encoder.config)
+        if "video" in media and media["video"] is not None:
+            image_processor = encoder.vision_tower.image_processor
+            media["video"] = [self.process_images(frames, image_processor, encoder.config).half() for frames in media["video"]]
+
+        # Token ids get a leading dimension via unsqueeze(0) and are moved to CUDA for embedding
+        token_ids = self.tokenize_conversation(conversation, encoder.tokenizer, add_generation_prompt=True)
+        input_ids = token_ids.unsqueeze(0).cuda()
+
+        # Only the first value returned by the encoder's _embed is used
+        inputs_embeds, _, _ = encoder._embed(input_ids, media, {"video": {}}, None, None)
+
+        return inputs_embeds, params
+
+    def generate_until(self, requests) -> List[str]:
+        """
+        Generate one completion per request, serving cached responses where available.
+        Uncached requests are embedded one at a time and sent to vLLM in batches of
+        ``self.batch_size_per_gpu``; each prompt is paired with its own sampling params.
+        """
+        self.load_cache()
+        res, requests = self.get_response_from_cache(requests)
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+
+        batch_size = self.batch_size_per_gpu
+        for start in range(0, len(requests), batch_size):
+            batch_requests = requests[start : start + batch_size]
+            prompt_embeds_list = []
+            sampling_params = []
+            # Build embeddings sequentially to avoid GPU contention in the encoder
+            for req in tqdm(batch_requests, disable=(self.rank != 0), desc="Building embeddings"):
+                inputs_embeds, params = self.make_one_request(req)
+                # squeeze(0) removes the leading size-1 dimension before handing the tensor to vLLM
+                prompt_embeds_list.append({"prompt_embeds": inputs_embeds.squeeze(0)})
+                sampling_params.append(SamplingParams(**params))
+
+            # A list of SamplingParams is matched to the prompts one by one, so requests
+            # with different gen_kwargs can share a batch instead of inheriting the last
+            # request's settings.
+            response = self.client.generate(prompts=prompt_embeds_list, sampling_params=sampling_params)
+
+            response_text = [o.outputs[0].text for o in response]
+            # Verify the count before zipping so a short result is never cached silently
+            assert len(response_text) == len(batch_requests)
+            for req, text in zip(batch_requests, response_text):
+                self.add_request_response_to_cache(req, text)
+
+            res.extend(response_text)
+            pbar.update(len(batch_requests))
+
+        pbar.close()
+        return res
+
+    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+        # Not implemented for LongVILA; raised explicitly because a bare assert is stripped under -O
+        raise NotImplementedError("LongVila does not support loglikelihood")
+
+    def generate_until_multi_round(self, requests) -> List[str]:
+        raise NotImplementedError("TODO: Implement multi-round generation")  # multi-round generation is not implemented for this model; always raises
diff --git a/lmms_eval/tasks/charades_sta/eval_tvg.py b/lmms_eval/tasks/charades_sta/eval_tvg.py
index 6a43841fe..4689ad6de 100644
--- a/lmms_eval/tasks/charades_sta/eval_tvg.py
+++ b/lmms_eval/tasks/charades_sta/eval_tvg.py
@@ -40,7 +40,7 @@ def extract_time(paragraph):
timestamps = []
# Check for The given query happens in m - n (seconds)
- patterns = [r"(\d+\.*\d*)\s*-\s*(\d+\.*\d*)"]
+ patterns = [r"(\d+\.*\d*)\s*[–-]\s*(\d+\.*\d*)"]
for time_pattern in patterns:
time_matches = re.findall(time_pattern, paragraph)
@@ -84,6 +84,16 @@ def extract_time(paragraph):
times.append(time_in_sec)
times = times[: len(times) // 2 * 2]
timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]
+ # Fallback: if no timestamps found, search for any two number patterns with dash
+ if len(timestamps) == 0:
+ # More comprehensive pattern to match various formats like:
+ # xx - xx, x.xx s - x.xx s, x.xxs - x.xxs, etc.
+ # Also handle en dash (–) and regular dash (-)
+ fallback_pattern = r"(\d+(?:\.\d+)?)\s*s?\s*[–-]\s*(\d+(?:\.\d+)?)\s*s?"
+ fallback_matches = re.findall(fallback_pattern, paragraph)
+ if fallback_matches:
+ timestamps = [[float(start), float(end)] for start, end in fallback_matches]
+
results = []
for start, end in timestamps:
if end > start:
diff --git a/lmms_eval/tasks/lvbench/lvbench.yaml b/lmms_eval/tasks/lvbench/lvbench.yaml
new file mode 100644
index 000000000..58492f84d
--- /dev/null
+++ b/lmms_eval/tasks/lvbench/lvbench.yaml
@@ -0,0 +1,28 @@
+dataset_path: lmms-lab/LVBench
+dataset_kwargs:
+ token: True
+ cache_dir: lvbench
+ video: True
+ # From_YouTube: True
+test_split: train
+task: lvbench
+output_type: generate_until
+doc_to_visual: !function utils.lvbench_doc_to_visual
+doc_to_text: !function utils.lvbench_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+ max_new_tokens: 16
+# The return value of process_results will be used by metrics
+process_results: !function utils.lvbench_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: lvbench_score
+ aggregation: mean
+ higher_is_better: true
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: ""
+ post_prompt: "\nAnswer the question with the option letter"
+metadata:
+ - version: 0.0
+
diff --git a/lmms_eval/tasks/lvbench/utils.py b/lmms_eval/tasks/lvbench/utils.py
new file mode 100644
index 000000000..8b16fd125
--- /dev/null
+++ b/lmms_eval/tasks/lvbench/utils.py
@@ -0,0 +1,74 @@
+import os
+import re
+from pathlib import Path
+
+import yaml
+
+# Hugging Face cache root: HF_HOME if set, otherwise the default with "~" expanded.
+hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
+base_cache_dir = os.path.expanduser(hf_home)
+
+# The task YAML carries ``!function`` tags that ``yaml.safe_load`` cannot parse, so the
+# lines containing them are dropped before the config is read for ``cache_dir``.
+with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
+    raw_data = f.readlines()
+safe_data = [line for line in raw_data if "!function" not in line]
+cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+
+
+def lvbench_doc_to_visual(doc):
+    """Return ``[path]`` for the doc's video, located under the LVBench cache directory."""
+    video_path = os.path.join(base_cache_dir, cache_name, doc["video_path"])
+    # Name the offending path so a missing or partial download is easy to diagnose.
+    assert os.path.exists(video_path), f"video path: {video_path} does not exist, please check"
+    return [video_path]
+
+
+def lvbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the prompt ``pre_prompt + question + post_prompt`` for one LVBench doc.
+
+    Missing prompt keys fall back to defaults; the caller's kwargs dict is not modified.
+    """
+    kwargs = lmms_eval_specific_kwargs or {}
+    default_post_prompt = "\nAnswer the question with the option letter"
+    return kwargs.get("pre_prompt", "") + doc["question"] + kwargs.get("post_prompt", default_post_prompt)
+
+
+def extract_characters_regex(s):
+    """Return the first option letter (A-D) left in ``s`` once known answer prefixes are removed, else ""."""
+    s = s.strip()
+    # Every prefix needs its own trailing comma: adjacent string literals are silently concatenated.
+    answer_prefixes = [
+        "The best answer is",
+        "The correct answer is",
+        "The answer is",
+        "The answer",
+        "The best option is",
+        "The correct option is",
+        "Best answer:",
+        "Best option:",
+    ]
+    for answer_prefix in answer_prefixes:
+        s = s.replace(answer_prefix, "")
+
+    # A response without any of A-D (however long) yields no answer.
+    matches = re.search(r"[ABCD]", s)
+    return "" if matches is None else matches[0]
+
+
+def lvbench_process_results(doc, results):
+    """
+    Score a single LVBench prediction.
+
+    Args:
+        doc: an instance of the eval dataset; its "answer" field is the ground truth
+        results: [pred], the raw model response
+    Returns:
+        a dictionary with key: metric name (lvbench_score), value: whether the option
+        letter extracted from the response equals the ground-truth answer
+    """
+    predicted_letter = extract_characters_regex(results[0])
+    # exact comparison: the ground-truth answer is used as stored, without normalization
+    is_correct = predicted_letter == doc["answer"]
+
+    return {"lvbench_score": is_correct}