diff --git a/.gitignore b/.gitignore index 997f8f228..99e488d9e 100755 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ span.log uv.lock workspace/* .claude/* +remote_code/* diff --git a/lmms_eval/loggers/evaluation_tracker.py b/lmms_eval/loggers/evaluation_tracker.py index 115feb2d3..65936cdf9 100644 --- a/lmms_eval/loggers/evaluation_tracker.py +++ b/lmms_eval/loggers/evaluation_tracker.py @@ -68,7 +68,7 @@ def extract_model_name(model_args: str, key: str) -> str: return args_after_key.split(",")[0] # order does matter, e.g. peft and delta are provided together with pretrained - prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="] + prefixes = ["peft=", "delta=", "pretrained=", "model=", "model_version=", "model_name=", "model_id=", "path=", "engine="] for prefix in prefixes: if prefix in model_args: return extract_model_name(model_args, prefix) diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index 0638c56ee..ad45a0d9a 100644 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -88,6 +88,7 @@ "sglang": "Sglang", "huggingface": "Huggingface", "async_openai": "AsyncOpenAIChat", + "longvila": "LongVila", } diff --git a/lmms_eval/models/chat/async_openai.py b/lmms_eval/models/chat/async_openai.py index fb258c21e..b9b0ea788 100644 --- a/lmms_eval/models/chat/async_openai.py +++ b/lmms_eval/models/chat/async_openai.py @@ -184,11 +184,17 @@ async def maybe_forward_with_tool(self, request: Instance, idx: int): for call in message.tool_calls: eval_logger.debug(f"Calling {call.function.name}...") result = await self.mcp_client.run_tool(call.function.name, eval(call.function.arguments)) - all_response += f"{call.function.name} {call.function.arguments}" + all_response += f"{call.function.name} {call.function.arguments}" tool_messages.append({"role": "tool", "name": call.function.name, "content": []}) for content in result.content: tool_message = self.mcp_client.convert_result_to_openai_format(content) + 
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional, Tuple, Union

from tqdm import tqdm
from transformers import AutoModel

from lmms_eval.api.instance import Instance
from lmms_eval.api.registry import register_model
from lmms_eval.models.model_utils.gen_metrics import log_metrics
from lmms_eval.models.simple.vllm import VLLM as VLLMSimple
from lmms_eval.protocol import ChatMessages

try:
    from vllm import LLM, SamplingParams
except ImportError:
    # Keep the module importable without vLLM. Every name the import would
    # have bound is set to None so that later use can be detected explicitly
    # instead of surfacing as a NameError.
    vllm = None
    LLM = None
    SamplingParams = None

WORKERS = int(os.getenv("WORKERS", "32"))


@register_model("longvila")
class LongVila(VLLMSimple):
    """LongVILA chat model served through vLLM with prompt embeddings.

    An encoder loaded with ``transformers.AutoModel`` turns each conversation
    (text plus media) into prompt embeddings; those embeddings are handed to
    the vLLM client created by the parent class for text generation.
    """

    is_simple = False

    def __init__(
        self,
        model="Efficient-Large-Model/LongVILA-R1-7B",
        tensor_parallel_size=1,
        data_parallel_size=1,
        gpu_memory_utilization=0.5,
        batch_size=1,
        max_frame_num=32,
        trust_remote_code=True,
        chat_template=None,
        max_pixels: int = 1605632,
        min_image_pixels=28,
        fps: Optional[int] = None,
        device_map: Optional[str] = "cuda",
        **kwargs,
    ):
        """Load the embedding encoder and initialise the vLLM parent.

        Args:
            model: Root of the LongVILA checkpoint. The encoder is loaded from
                this root and ``<model>/llm`` is what the parent (vLLM) loads.
                NOTE(review): the ``os.path.join`` below suggests a local
                directory layout is expected -- confirm how a hub id such as
                the default is meant to be resolved.
            tensor_parallel_size, data_parallel_size, gpu_memory_utilization,
            batch_size, max_frame_num, trust_remote_code, chat_template,
            min_image_pixels: Forwarded positionally, in that order, to
                ``VLLMSimple.__init__``.
            max_pixels: Stored on the instance; not otherwise used in this class.
            fps: When truthy, written to the encoder config before media
                extraction; otherwise ``max_frame_num`` frames are requested.
            device_map: Passed to ``AutoModel.from_pretrained`` for the encoder.
            **kwargs: Forwarded to the parent; ``enable_prompt_embeds`` is
                always forced to ``True``.

        Raises:
            ImportError: If vLLM is not installed, or if the ``remote_code``
                helper modules cannot be imported.
        """
        if SamplingParams is None:
            # Fail fast, before the (expensive) encoder load.
            raise ImportError("vllm is required to use the 'longvila' model but could not be imported.")

        # vLLM requires the path to the autoregressive llm weights under the model root
        model_root = model
        llm_path = os.path.join(model_root, "llm")
        # Enable prompt embeddings so we can pass encoder-produced embeddings directly
        kwargs["enable_prompt_embeds"] = True
        self.fps = fps
        self.max_pixels = max_pixels

        # The LongVILA repo provides preprocessing utilities we must call directly.
        # They are imported from a top-level `remote_code` package, which therefore
        # has to be importable from the current sys.path.
        try:
            from remote_code.media import extract_media as _extract_media
            from remote_code.mm_utils import process_images as _process_images
            from remote_code.tokenizer_utils import (
                tokenize_conversation as _tokenize_conversation,
            )
        except Exception as e:
            # Chain the original exception so the real cause stays in the traceback.
            raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}") from e

        self.extract_media = _extract_media
        self.process_images = _process_images
        self.tokenize_conversation = _tokenize_conversation

        # Load the encoder that produces prompt embeddings for the LLM.
        # llm_only_need_embed reduces memory usage to only what's needed for embedding.
        # NOTE(review): trust_remote_code is hard-coded to True here; the
        # `trust_remote_code` argument only reaches the vLLM parent constructor.
        self.model_encoder = AutoModel.from_pretrained(
            model_root,
            trust_remote_code=True,
            device_map=device_map,
            llm_only_need_embed=True,
        )
        super().__init__(llm_path, tensor_parallel_size, data_parallel_size, gpu_memory_utilization, batch_size, max_frame_num, trust_remote_code, chat_template, min_image_pixels, **kwargs)

    def _to_remote_conversation(self, chat_messages: ChatMessages) -> list:
        """Convert ChatMessages to the LongVILA remote_code conversation format.

        The result has the shape
        ``[{"from": "human"|"gpt", "value": [str | {"path": media_path}, ...]}, ...]``.

        ``user`` and ``system`` (and any unknown role) map to ``"human"``,
        ``assistant`` maps to ``"gpt"``. Content items whose ``type`` is not
        text/image/video/audio are dropped, and messages left with no parts
        are omitted entirely.
        """
        role_map = {"user": "human", "assistant": "gpt", "system": "human"}
        conversation = []
        for msg in chat_messages.messages:
            value_parts = []
            for content in msg.content:
                content_type = getattr(content, "type", None)
                if content_type == "text":
                    value_parts.append(content.text)
                elif content_type in ("image", "video", "audio"):
                    # Media is referenced by path dicts, as required by tokenizer_utils
                    value_parts.append({"path": content.url})
            if value_parts:
                conversation.append({"from": role_map.get(msg.role, "human"), "value": value_parts})
        return conversation

    def make_one_request(self, request: Instance) -> Tuple[object, dict]:
        """Build prompt embeddings and sampling params for one Instance.

        Args:
            request: Instance whose ``arguments`` unpack to
                ``(ctx, doc_to_messages, gen_kwargs, doc_id, task, split)``.

        Returns:
            ``(inputs_embeds, params)`` where ``inputs_embeds`` is the first
            value returned by the encoder's ``_embed`` and ``params`` holds
            the ``temperature``, ``max_tokens`` and ``top_p`` keyword
            arguments for ``SamplingParams``.

        The request's ``gen_kwargs`` dict is copied, not mutated. The shared
        encoder config *is* updated (fps / num_video_frames) on every call.
        """
        ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
        raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
        chat_messages = ChatMessages(messages=raw_messages)

        # Work on a copy so defaults never leak into the caller's gen_kwargs.
        _gen = dict(gen_kwargs or {})
        _gen.setdefault("max_new_tokens", 4096)
        _gen.setdefault("temperature", 0)
        _gen.setdefault("top_p", 0.95)

        params = {
            "temperature": _gen["temperature"],
            "max_tokens": _gen["max_new_tokens"],
            "top_p": _gen["top_p"],
        }

        # Convert to LongVILA remote_code conversation format
        conversation = self._to_remote_conversation(chat_messages)

        # Select the frame-sampling mode: a fixed fps when configured, otherwise a
        # fixed number of frames with fps set to 0.
        encoder_config = self.model_encoder.config
        if self.fps:
            encoder_config.fps = self.fps
        else:
            encoder_config.num_video_frames = self.max_frame_num
            encoder_config.fps = 0

        # Extract and preprocess media
        media = self.extract_media(conversation, encoder_config)
        if media.get("video") is not None:
            image_processor = self.model_encoder.vision_tower.image_processor
            media["video"] = [self.process_images(images, image_processor, encoder_config).half() for images in media["video"]]

        # Tokenize conversation, add a leading batch dimension and move to CUDA for embedding
        input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()

        # Create prompt embeddings using the model encoder
        inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)

        return inputs_embeds, params

    def generate_until(self, requests) -> List[str]:
        """Generate one response string per request.

        Responses already present in the cache are returned without running
        the model; the remaining requests are processed in batches of
        ``self.batch_size_per_gpu``. Each request is generated with its own
        sampling parameters.

        Raises:
            RuntimeError: If the client returns a different number of outputs
                than prompts in a batch.
        """
        self.load_cache()
        res, requests = self.get_response_from_cache(requests)
        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

        batch_size = self.batch_size_per_gpu
        batched_requests = [requests[i : i + batch_size] for i in range(0, len(requests), batch_size)]
        # NOTE(review): latency is measured but not reported anywhere yet;
        # `log_metrics` is imported in this module but never called.
        e2e_latency = 0
        for batch_requests in batched_requests:
            prompt_embeds_list = []
            sampling_params = []
            # Build embeddings sequentially to avoid GPU contention in the encoder
            for req in tqdm(batch_requests, disable=(self.rank != 0), desc="Building embeddings"):
                inputs_embeds, params = self.make_one_request(req)
                # Drop the leading batch dimension added in make_one_request.
                prompt_embeds_list.append({"prompt_embeds": inputs_embeds.squeeze(0)})
                # One SamplingParams per prompt, so heterogeneous gen_kwargs
                # inside a batch are honoured rather than overridden.
                sampling_params.append(SamplingParams(**params))

            start_time = time.time()
            response = self.client.generate(prompts=prompt_embeds_list, sampling_params=sampling_params)
            end_time = time.time()

            response_text = [o.outputs[0].text for o in response]
            # Validate before touching the cache: zip() would silently truncate.
            if len(response_text) != len(batch_requests):
                raise RuntimeError(f"Expected {len(batch_requests)} responses from the client, got {len(response_text)}")

            for req, text in zip(batch_requests, response_text):
                self.add_request_response_to_cache(req, text)

            # Accumulate timing for the batch
            e2e_latency += end_time - start_time

            res.extend(response_text)
            pbar.update(len(batch_requests))

        pbar.close()
        return res

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        """Not supported by this model; always raises NotImplementedError."""
        raise NotImplementedError("LongVila does not support loglikelihood")

    def generate_until_multi_round(self, requests) -> List[str]:
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError("TODO: Implement multi-round generation")
+ fallback_matches = re.findall(fallback_pattern, paragraph) + if fallback_matches: + timestamps = [[float(start), float(end)] for start, end in fallback_matches] + results = [] for start, end in timestamps: if end > start: diff --git a/lmms_eval/tasks/lvbench/lvbench.yaml b/lmms_eval/tasks/lvbench/lvbench.yaml new file mode 100644 index 000000000..58492f84d --- /dev/null +++ b/lmms_eval/tasks/lvbench/lvbench.yaml @@ -0,0 +1,28 @@ +dataset_path: lmms-lab/LVBench +dataset_kwargs: + token: True + cache_dir: lvbench + video: True + # From_YouTube: True +test_split: train +task: lvbench +output_type: generate_until +doc_to_visual: !function utils.lvbench_doc_to_visual +doc_to_text: !function utils.lvbench_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 16 +# The return value of process_results will be used by metrics +process_results: !function utils.lvbench_process_results +# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results +metric_list: + - metric: lvbench_score + aggregation: mean + higher_is_better: true +lmms_eval_specific_kwargs: + default: + pre_prompt: "" + post_prompt: "\nAnswer the question with the option letter" +metadata: + - version: 0.0 + diff --git a/lmms_eval/tasks/lvbench/utils.py b/lmms_eval/tasks/lvbench/utils.py new file mode 100644 index 000000000..8b16fd125 --- /dev/null +++ b/lmms_eval/tasks/lvbench/utils.py @@ -0,0 +1,74 @@ +import os +import re +from pathlib import Path + +import yaml + +hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/") +base_cache_dir = os.path.expanduser(hf_home) +with open(Path(__file__).parent / "lvbench.yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) +cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"] + + 
def lvbench_doc_to_visual(doc):
    """Return the local path of the video for *doc* as a one-element list.

    The path is ``doc["video_path"]`` joined onto the task cache directory
    (module-level ``base_cache_dir`` and ``cache_name``).

    Raises:
        FileNotFoundError: If the resulting path does not exist on disk.
    """
    video_path = os.path.join(base_cache_dir, cache_name, doc["video_path"])
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"LVBench video not found: {video_path}")
    return [video_path]


def lvbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the prompt: ``pre_prompt + doc["question"] + post_prompt``.

    Missing keys fall back to an empty ``pre_prompt`` and the default
    option-letter instruction as ``post_prompt``. The caller's kwargs dict is
    only read, never modified.
    """
    prompt_kwargs = lmms_eval_specific_kwargs or {}
    pre_prompt = prompt_kwargs.get("pre_prompt", "")
    post_prompt = prompt_kwargs.get("post_prompt", "\nAnswer the question with the option letter")
    return pre_prompt + doc["question"] + post_prompt


# Lead-in phrases stripped before searching for the option letter, so that a
# capital inside the phrase itself (e.g. the "B" of "Best answer:") is not
# mistaken for the chosen option. Order matters: "The answer is" must be
# removed before the shorter "The answer".
_ANSWER_PREFIXES = (
    "The best answer is",
    "The correct answer is",
    "The answer is",
    "The answer",
    "The best option is",
    "The correct option is",
    "Best answer:",
    "Best option:",
)


def extract_characters_regex(s):
    """Return the first option letter (A-D) found in *s*, or ``""`` if none.

    Known lead-in phrases are removed first; after that the first capital
    ``A``/``B``/``C``/``D`` anywhere in the remaining text is taken as the
    answer, so a capital in any other preamble still wins over a later letter.
    """
    s = s.strip()
    for answer_prefix in _ANSWER_PREFIXES:
        s = s.replace(answer_prefix, "")

    match = re.search(r"[ABCD]", s)
    return match[0] if match else ""


def lvbench_process_results(doc, results):
    """Score one prediction against the ground-truth option letter.

    Args:
        doc: An instance of the eval dataset; ``doc["answer"]`` is the target.
        results: ``[pred]``, the raw model output as a single-element list.

    Returns:
        A dictionary mapping the metric name ``lvbench_score`` to ``True``
        when the extracted letter equals ``doc["answer"]`` and ``False``
        otherwise.
    """
    pred_ans = extract_characters_regex(results[0])
    score = pred_ans == doc["answer"]
    return {"lvbench_score": score}