EvolvingLMMs-Lab · kcz358 · Sep 17, 2025 · Sep 8, 2025 · Sep 11, 2025 · Aug 13, 2025
diff --git a/.gitignore b/.gitignore
@@ -50,3 +50,4 @@ span.log
 uv.lock
 workspace/*
 .claude/*
+remote_code/*
diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py
@@ -88,6 +88,7 @@
     "sglang": "Sglang",
     "huggingface": "Huggingface",
     "async_openai": "AsyncOpenAIChat",
+    "longvila": "LongVila",
 }
 
 

diff --git a/lmms_eval/models/chat/longvila.py b/lmms_eval/models/chat/longvila.py
@@ -0,0 +1,184 @@
+import os
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Optional, Tuple, Union
+
+from tqdm import tqdm
+from transformers import AutoModel
+
+from lmms_eval.api.instance import Instance
+from lmms_eval.api.registry import register_model
+from lmms_eval.models.model_utils.gen_metrics import log_metrics
+from lmms_eval.models.simple.vllm import VLLM as VLLMSimple
+from lmms_eval.protocol import ChatMessages
+
+try:
+    from vllm import LLM, SamplingParams
+except ImportError:
+    vllm = None
+
-try:
-    from vllm import LLM, SamplingParams
-except ImportError:
-    vllm = None
+try:
+    from vllm import LLM, SamplingParams
+except ImportError as _vllm_err:
+    LLM = None  # type: ignore[assignment]
+    SamplingParams = None  # type: ignore[assignment]
+    _VLLM_IMPORT_ERROR = _vllm_err
-try:
-    from vllm import LLM, SamplingParams
-except ImportError:
-    vllm = None
+try:
+    from vllm import LLM, SamplingParams
+except ImportError as _vllm_err:
+    LLM = None  # type: ignore[assignment]
+    SamplingParams = None  # type: ignore[assignment]
+    _VLLM_IMPORT_ERROR = _vllm_err
+WORKERS = int(os.getenv("WORKERS", "32"))
+
+
+@register_model("longvila")
+class LongVila(VLLMSimple):
+    is_simple = False
+
+    def __init__(
+        self,
+        model="Efficient-Large-Model/LongVILA-R1-7B",
+        tensor_parallel_size=1,
+        data_parallel_size=1,
+        gpu_memory_utilization=0.5,
+        batch_size=1,
+        max_frame_num=32,
+        trust_remote_code=True,
+        chat_template=None,
+        max_pixels: int = 1605632,
+        min_image_pixels=28,
+        fps: Optional[int] = None,
+        device_map: Optional[str] = "cuda",
+        **kwargs,
+    ):
+        # vLLM requires the path to the autoregressive llm weights under the model root
+        model_root = model
+        llm_path = os.path.join(model_root, "llm")
+        # Enable prompt embeddings so we can pass encoder-produced embeddings directly
-    def __init__(
-        self,
-        model="Efficient-Large-Model/LongVILA-R1-7B",
-        tensor_parallel_size=1,
-        data_parallel_size=1,
-        gpu_memory_utilization=0.5,
-        batch_size=1,
-        max_frame_num=32,
-        trust_remote_code=True,
-        chat_template=None,
-        max_pixels: int = 1605632,
-        min_image_pixels=28,
-        fps: Optional[int] = None,
-        device_map: Optional[str] = "cuda",
-        **kwargs,
-    ):
-        # vLLM requires the path to the autoregressive llm weights under the model root
-        model_root = model
-        llm_path = os.path.join(model_root, "llm")
-        # Enable prompt embeddings so we can pass encoder-produced embeddings directly
+    def __init__(
+        self,
+        model="Efficient-Large-Model/LongVILA-R1-7B",
+        tensor_parallel_size=1,
+        data_parallel_size=1,
+        gpu_memory_utilization=0.5,
+        batch_size=1,
+        max_frame_num=32,
+        trust_remote_code=True,
+        chat_template=None,
+        max_pixels: int = 1605632,
+        min_image_pixels=28,
+        fps: Optional[int] = None,
+        device_map: Optional[str] = "cuda",
+        **kwargs,
+    ):
+        # vLLM requires the path to the autoregressive llm weights under the model root
+        model_root = model
+        llm_path = os.path.join(model_root, "llm")
+        if not os.path.isdir(llm_path):
+            raise FileNotFoundError(
+                f"Expected autoregressive LLM under '{llm_path}'. "
+                "Verify your LongVILA model layout."
+            )
+        # Enable prompt embeddings so we can pass encoder-produced embeddings directly
+        kwargs["enable_prompt_embeds"] = True
+        super().__init__(
+            llm_path,
+            tensor_parallel_size,
+            data_parallel_size,
+            gpu_memory_utilization,
+            batch_size,
+            max_frame_num,
+            trust_remote_code,
+            chat_template,
+            min_image_pixels,
+            **kwargs,
+        )
-    def __init__(
-        self,
-        model="Efficient-Large-Model/LongVILA-R1-7B",
-        tensor_parallel_size=1,
-        data_parallel_size=1,
-        gpu_memory_utilization=0.5,
-        batch_size=1,
-        max_frame_num=32,
-        trust_remote_code=True,
-        chat_template=None,
-        max_pixels: int = 1605632,
-        min_image_pixels=28,
-        fps: Optional[int] = None,
-        device_map: Optional[str] = "cuda",
-        **kwargs,
-    ):
-        # vLLM requires the path to the autoregressive llm weights under the model root
-        model_root = model
-        llm_path = os.path.join(model_root, "llm")
-        # Enable prompt embeddings so we can pass encoder-produced embeddings directly
+    def __init__(
+        self,
+        model="Efficient-Large-Model/LongVILA-R1-7B",
+        tensor_parallel_size=1,
+        data_parallel_size=1,
+        gpu_memory_utilization=0.5,
+        batch_size=1,
+        max_frame_num=32,
+        trust_remote_code=True,
+        chat_template=None,
+        max_pixels: int = 1605632,
+        min_image_pixels=28,
+        fps: Optional[int] = None,
+        device_map: Optional[str] = "cuda",
+        **kwargs,
+    ):
+        # vLLM requires the path to the autoregressive llm weights under the model root
+        model_root = model
+        llm_path = os.path.join(model_root, "llm")
+        if not os.path.isdir(llm_path):
+            raise FileNotFoundError(
+                f"Expected autoregressive LLM under '{llm_path}'. "
+                "Verify your LongVILA model layout."
+            )
+        # Enable prompt embeddings so we can pass encoder-produced embeddings directly
+        kwargs["enable_prompt_embeds"] = True
+        super().__init__(
+            llm_path,
+            tensor_parallel_size,
+            data_parallel_size,
+            gpu_memory_utilization,
+            batch_size,
+            max_frame_num,
+            trust_remote_code,
+            chat_template,
+            min_image_pixels,
+            **kwargs,
+        )
+        kwargs["enable_prompt_embeds"] = True
+        self.fps = fps
+        self.max_pixels = max_pixels
+
+        # Set up imports from the model's remote_code directory
+        # The LongVILA repo provides preprocessing utilities we must call directly
+        try:
+            from remote_code.media import extract_media as _extract_media
+            from remote_code.mm_utils import process_images as _process_images
+            from remote_code.tokenizer_utils import (
+                tokenize_conversation as _tokenize_conversation,
+            )
+        except Exception as e:
+            raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}")
+
+        self.extract_media = _extract_media
+        self.process_images = _process_images
+        self.tokenize_conversation = _tokenize_conversation
+
-        # Set up imports from the model's remote_code directory
-        # The LongVILA repo provides preprocessing utilities we must call directly
-        try:
-            from remote_code.media import extract_media as _extract_media
-            from remote_code.mm_utils import process_images as _process_images
-            from remote_code.tokenizer_utils import (
-                tokenize_conversation as _tokenize_conversation,
-            )
-        except Exception as e:
-            raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}")
-
-        self.extract_media = _extract_media
-        self.process_images = _process_images
-        self.tokenize_conversation = _tokenize_conversation
+        # Set up imports from the model's remote_code directory
+        # The LongVILA repo provides preprocessing utilities we must call directly
+        try:
+            remote_code_dir = os.path.join(model_root)
+            if remote_code_dir not in sys.path:
+                sys.path.insert(0, remote_code_dir)
+            from remote_code.media import extract_media as _extract_media  # type: ignore
+            from remote_code.mm_utils import process_images as _process_images  # type: ignore
+            from remote_code.tokenizer_utils import (  # type: ignore
+                tokenize_conversation as _tokenize_conversation,
+            )
+        except Exception as err:
+            raise ImportError(
+                f"Failed to import LongVILA remote_code from '{model_root}'. "
+                "Ensure the model path contains a 'remote_code/' package."
+            ) from err
+
+        self.extract_media = _extract_media
+        self.process_images = _process_images
+        self.tokenize_conversation = _tokenize_conversation
-        # Set up imports from the model's remote_code directory
-        # The LongVILA repo provides preprocessing utilities we must call directly
-        try:
-            from remote_code.media import extract_media as _extract_media
-            from remote_code.mm_utils import process_images as _process_images
-            from remote_code.tokenizer_utils import (
-                tokenize_conversation as _tokenize_conversation,
-            )
-        except Exception as e:
-            raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}")
-
-        self.extract_media = _extract_media
-        self.process_images = _process_images
-        self.tokenize_conversation = _tokenize_conversation
+        # Set up imports from the model's remote_code directory
+        # The LongVILA repo provides preprocessing utilities we must call directly
+        try:
+            remote_code_dir = os.path.join(model_root)
+            if remote_code_dir not in sys.path:
+                sys.path.insert(0, remote_code_dir)
+            from remote_code.media import extract_media as _extract_media  # type: ignore
+            from remote_code.mm_utils import process_images as _process_images  # type: ignore
+            from remote_code.tokenizer_utils import (  # type: ignore
+                tokenize_conversation as _tokenize_conversation,
+            )
+        except Exception as err:
+            raise ImportError(
+                f"Failed to import LongVILA remote_code from '{model_root}'. "
+                "Ensure the model path contains a 'remote_code/' package."
+            ) from err
+
+        self.extract_media = _extract_media
+        self.process_images = _process_images
+        self.tokenize_conversation = _tokenize_conversation
+        # Load the encoder that produces prompt embeddings for the LLM
+        # llm_only_need_embed reduces memory usage to only what's needed for embedding
+        self.model_encoder = AutoModel.from_pretrained(
+            model_root,
+            trust_remote_code=True,
+            device_map=device_map,
+            llm_only_need_embed=True,
+        )
+        super().__init__(llm_path, tensor_parallel_size, data_parallel_size, gpu_memory_utilization, batch_size, max_frame_num, trust_remote_code, chat_template, min_image_pixels, **kwargs)
+
+    def _to_remote_conversation(self, chat_messages: ChatMessages) -> list:
+        """
+        Re-express ``chat_messages`` in the turn layout used by the LongVILA remote_code helpers:
+        [{"from": "human"|"gpt", "value": [str | {"path": media_path}, ...]}, ...]
+
+        Assistant messages become "gpt" turns; every other role becomes a "human" turn.
+        Text items contribute their text, image/video/audio items contribute a
+        {"path": url} dict, and messages that yield no parts are left out.
+        """
+        speaker_for = {"assistant": "gpt"}
+        media_kinds = ("image", "video", "audio")
+        turns = []
+        for message in chat_messages.messages:
+            speaker = speaker_for.get(message.role, "human")
+            parts = []
+            for item in message.content:
+                kind = getattr(item, "type", None)
+                if kind == "text":
+                    parts.append(item.text)
+                elif kind in media_kinds:
+                    # Media is referenced by path; it is not loaded here
+                    parts.append({"path": item.url})
+            if not parts:
+                continue
+            turns.append({"from": speaker, "value": parts})
+        return turns
+
+    def make_one_request(self, request: Instance) -> Tuple["object", dict]:
+        """
+        Build prompt embeddings and per-request sampling params from an Instance.
+        Returns (inputs_embeds, params_dict). Does not mutate input.
+        """
+        ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
+        raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
+        chat_messages = ChatMessages(messages=raw_messages)
+
+        # Copy to avoid side-effects across threads
+        _gen = dict(gen_kwargs or {})
+        _gen.setdefault("max_new_tokens", 4096)
+        _gen.setdefault("temperature", 0)
+        _gen.setdefault("top_p", 0.95)
+
+        params = {
+            "temperature": _gen["temperature"],
+            "max_tokens": _gen["max_new_tokens"],
+            "top_p": _gen["top_p"],
+        }
+
+        # Convert to LongVILA remote_code conversation format
+        conversation = self._to_remote_conversation(chat_messages)
+
+        # Extract and preprocess media
+        if self.fps:
+            self.model_encoder.config.fps = self.fps
+        else:
+            self.model_encoder.config.num_video_frames = self.max_frame_num
+            self.model_encoder.config.fps = 0
+        media = self.extract_media(conversation, self.model_encoder.config)
+        if "video" in media and media["video"] is not None:
+            media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]
+
+        # Tokenize conversation and move to CUDA for embedding
+        input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
+
+        # Create prompt embeddings using the model encoder
+        inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)
+
+        return inputs_embeds, params
+
-    def make_one_request(self, request: Instance) -> Tuple["object", dict]:
-        """
-        Build prompt embeddings and per-request sampling params from an Instance.
-        Returns (inputs_embeds, params_dict). Does not mutate input.
-        """
-        ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
-        raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
-        chat_messages = ChatMessages(messages=raw_messages)
-
-        # Copy to avoid side-effects across threads
-        _gen = dict(gen_kwargs or {})
-        _gen.setdefault("max_new_tokens", 4096)
-        _gen.setdefault("temperature", 0)
-        _gen.setdefault("top_p", 0.95)
-
-        params = {
-            "temperature": _gen["temperature"],
-            "max_tokens": _gen["max_new_tokens"],
-            "top_p": _gen["top_p"],
-        }
-
-        # Convert to LongVILA remote_code conversation format
-        conversation = self._to_remote_conversation(chat_messages)
-
-        # Extract and preprocess media
-        if self.fps:
-            self.model_encoder.config.fps = self.fps
-        else:
-            self.model_encoder.config.num_video_frames = self.max_frame_num
-            self.model_encoder.config.fps = 0
-        media = self.extract_media(conversation, self.model_encoder.config)
-        if "video" in media and media["video"] is not None:
-            media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]
-
-        # Tokenize conversation and move to CUDA for embedding
-        input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
-
-        # Create prompt embeddings using the model encoder
-        inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)
-
-        return inputs_embeds, params
+    def make_one_request(self, request: Instance) -> Tuple["object", dict]:
+        """
+        Build prompt embeddings and per-request sampling params from an Instance.
+        Returns (inputs_embeds, params_dict). Does not mutate input.
+        """
+        ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
+        raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
+        chat_messages = ChatMessages(messages=raw_messages)
+
+        # Copy to avoid side-effects across threads
+        _gen = dict(gen_kwargs or {})
+        _gen.setdefault("max_new_tokens", 4096)
+        _gen.setdefault("temperature", 0)
+        _gen.setdefault("top_p", 0.95)
+
+        params = {
+            "temperature": _gen["temperature"],
+            "max_tokens": _gen["max_new_tokens"],
+            "top_p": _gen["top_p"],
+        }
+
+        # Convert to LongVILA remote_code conversation format
+        conversation = self._to_remote_conversation(chat_messages)
+
+        # Extract and preprocess media
+        if self.fps:
+            self.model_encoder.config.fps = self.fps
+        else:
+            self.model_encoder.config.num_video_frames = self.max_frame_num
+            self.model_encoder.config.fps = 0
+        media = self.extract_media(conversation, self.model_encoder.config)
+        if "video" in media and media["video"] is not None:
+            media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]
+
+        # Tokenize conversation and move to CUDA for embedding
+        input_ids = (
+            self.tokenize_conversation(
+                conversation,
+                self.model_encoder.tokenizer,
+                add_generation_prompt=True,
+            )
+            .unsqueeze(0)
+            .to(next(self.model_encoder.parameters()).device)
+        )
+
+        # Create prompt embeddings using the model encoder
+        inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)
+
+        return inputs_embeds, params
-    def make_one_request(self, request: Instance) -> Tuple["object", dict]:
-        """
-        Build prompt embeddings and per-request sampling params from an Instance.
-        Returns (inputs_embeds, params_dict). Does not mutate input.
-        """
-        ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
-        raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
-        chat_messages = ChatMessages(messages=raw_messages)
-
-        # Copy to avoid side-effects across threads
-        _gen = dict(gen_kwargs or {})
-        _gen.setdefault("max_new_tokens", 4096)
-        _gen.setdefault("temperature", 0)
-        _gen.setdefault("top_p", 0.95)
-
-        params = {
-            "temperature": _gen["temperature"],
-            "max_tokens": _gen["max_new_tokens"],
-            "top_p": _gen["top_p"],
-        }
-
-        # Convert to LongVILA remote_code conversation format
-        conversation = self._to_remote_conversation(chat_messages)
-
-        # Extract and preprocess media
-        if self.fps:
-            self.model_encoder.config.fps = self.fps
-        else:
-            self.model_encoder.config.num_video_frames = self.max_frame_num
-            self.model_encoder.config.fps = 0
-        media = self.extract_media(conversation, self.model_encoder.config)
-        if "video" in media and media["video"] is not None:
-            media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]
-
-        # Tokenize conversation and move to CUDA for embedding
-        input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
-
-        # Create prompt embeddings using the model encoder
-        inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)
-
-        return inputs_embeds, params
+    def make_one_request(self, request: Instance) -> Tuple["object", dict]:
+        """
+        Build prompt embeddings and per-request sampling params from an Instance.
+
+        Args:
+            request: an Instance whose ``arguments`` unpack to
+                (ctx, doc_to_messages, gen_kwargs, doc_id, task, split).
+        Returns:
+            (inputs_embeds, params_dict), where params_dict has the keys
+            "temperature", "max_tokens" and "top_p".
+
+        ``gen_kwargs`` is copied before defaults are applied, so the request is
+        not mutated. Note that ``self.model_encoder.config`` IS updated in place
+        (fps / num_video_frames) on every call.
+        """
+        ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
+        # Look the document up by task/split/id and turn it into chat messages
+        raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
+        chat_messages = ChatMessages(messages=raw_messages)
+
+        # Copy to avoid side-effects across threads
+        _gen = dict(gen_kwargs or {})
+        # Defaults only apply to keys the task did not set
+        _gen.setdefault("max_new_tokens", 4096)
+        _gen.setdefault("temperature", 0)
+        _gen.setdefault("top_p", 0.95)
+
+        # "max_new_tokens" is exposed under the key "max_tokens"
+        params = {
+            "temperature": _gen["temperature"],
+            "max_tokens": _gen["max_new_tokens"],
+            "top_p": _gen["top_p"],
+        }
+
+        # Convert to LongVILA remote_code conversation format
+        conversation = self._to_remote_conversation(chat_messages)
+
+        # Extract and preprocess media
+        # A truthy self.fps selects fps-based sampling; otherwise a fixed frame
+        # count (self.max_frame_num) is used and fps is reset to 0
+        if self.fps:
+            self.model_encoder.config.fps = self.fps
+        else:
+            self.model_encoder.config.num_video_frames = self.max_frame_num
+            self.model_encoder.config.fps = 0
+        media = self.extract_media(conversation, self.model_encoder.config)
+        # Only the "video" entry is run through process_images (and cast to half);
+        # any other media entries are passed on as extract_media returned them
+        if "video" in media and media["video"] is not None:
+            media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]
+
+        # Tokenize conversation, add a batch dimension, and move to the encoder's device
+        input_ids = (
+            self.tokenize_conversation(
+                conversation,
+                self.model_encoder.tokenizer,
+                add_generation_prompt=True,
+            )
+            .unsqueeze(0)
+            .to(next(self.model_encoder.parameters()).device)
+        )
+
+        # Create prompt embeddings using the model encoder
+        # NOTE(review): the third argument presumably is a per-modality media config,
+        # and the two ignored return values are not used here; confirm against remote_code
+        inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)
+
+        return inputs_embeds, params
+    def generate_until(self, requests) -> List[str]:
+        """
+        Generate one text response per request from encoder-built prompt embeddings.
+
+        Requests that ``get_response_from_cache`` does not resolve are processed in
+        chunks of ``self.batch_size_per_gpu``. Each request is converted to prompt
+        embeddings by ``make_one_request`` and the chunk is sent to
+        ``self.client.generate``. Every generated text is written to the response cache.
+
+        Args:
+            requests: the Instance objects to answer.
+        Returns:
+            The list returned by ``get_response_from_cache``, extended with the
+            newly generated texts in request order.
+        """
+        self.load_cache()
+        res, requests = self.get_response_from_cache(requests)
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+
+        batch_size = self.batch_size_per_gpu
+        batched_requests = [requests[i : i + batch_size] for i in range(0, len(requests), batch_size)]
+        # NOTE: accumulated across batches but not reported anywhere in this method yet
+        e2e_latency = 0
+        for batch_requests in batched_requests:
+            prompt_embeds_list = []
+            params_list = []
+            # Build embeddings sequentially to avoid GPU contention in the encoder
+            for req in tqdm(batch_requests, disable=(self.rank != 0), desc="Building embeddings"):
+                inputs_embeds, params = self.make_one_request(req)
+                # Drop the leading batch dimension: one embedding tensor per prompt
+                prompt_embeds_list.append({"prompt_embeds": inputs_embeds.squeeze(0)})
+                params_list.append(params)
+
+            # For now, assume homogeneous sampling params within a batch
+            sampling_params = SamplingParams(**params_list[-1])
+
+            start_time = time.time()
+            response = self.client.generate(prompts=prompt_embeds_list, sampling_params=sampling_params)
+            end_time = time.time()
+
+            response_text = [o.outputs[0].text for o in response]
+            # Check alignment BEFORE caching: zip() would silently truncate and
+            # store misaligned request/response pairs
+            assert len(response_text) == len(batch_requests), f"Expected {len(batch_requests)} responses, got {len(response_text)}"
+            for req, text in zip(batch_requests, response_text):
+                self.add_request_response_to_cache(req, text)
+
+            # Calculate timing metrics for batch
+            e2e_latency += end_time - start_time
+
+            res.extend(response_text)
+            pbar.update(len(batch_requests))
+
+        pbar.close()
+        return res
+
+    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+        # TODO
+        assert False, "GPT4V not support"
+
+    def generate_until_multi_round(self, requests) -> List[str]:
+        raise NotImplementedError("TODO: Implement multi-round generation")
-    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
-        # TODO
-        assert False, "GPT4V not support"
-
-    def generate_until_multi_round(self, requests) -> List[str]:
-        raise NotImplementedError("TODO: Implement multi-round generation")
+    def loglikelihood(self, _requests: List[Instance]) -> List[Tuple[float, bool]]:  # noqa: ARG002
+        raise NotImplementedError("LongVila does not support loglikelihood.")
+
+    def generate_until_multi_round(self, requests) -> List[str]:
+        raise NotImplementedError("TODO: Implement multi-round generation")
-    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
-        # TODO
-        assert False, "GPT4V not support"
-
-    def generate_until_multi_round(self, requests) -> List[str]:
-        raise NotImplementedError("TODO: Implement multi-round generation")
+    def loglikelihood(self, _requests: List[Instance]) -> List[Tuple[float, bool]]:  # noqa: ARG002
+        """
+        Not supported by this model.
+
+        Raises:
+            NotImplementedError: always; the argument is ignored.
+        """
+        raise NotImplementedError("LongVila does not support loglikelihood.")
+
+    def generate_until_multi_round(self, requests) -> List[str]:
+        """
+        Placeholder for multi-round generation, which is not implemented yet.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        raise NotImplementedError("TODO: Implement multi-round generation")
diff --git a/lmms_eval/tasks/lvbench/lvbench.yaml b/lmms_eval/tasks/lvbench/lvbench.yaml
@@ -0,0 +1,28 @@
+dataset_path: lmms-lab/LVBench
+dataset_kwargs:
+  token: True
+  cache_dir: lvbench
+  video: True
+  # From_YouTube: True
+test_split: train
+task: lvbench
+output_type: generate_until
+doc_to_visual: !function utils.lvbench_doc_to_visual
+doc_to_text: !function utils.lvbench_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 16
+# The return value of process_results will be used by metrics
+process_results: !function utils.lvbench_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: lvbench_score
+    aggregation: mean
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer the question with the option letter"
+metadata:
+  - version: 0.0
+
diff --git a/lmms_eval/tasks/lvbench/utils.py b/lmms_eval/tasks/lvbench/utils.py
@@ -0,0 +1,74 @@
+import os
+import re
+from pathlib import Path
+
+import yaml
+
+hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
+base_cache_dir = os.path.expanduser(hf_home)
+with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
-hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
-base_cache_dir = os.path.expanduser(hf_home)
-with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
-    raw_data = f.readlines()
-    safe_data = []
-    for i, line in enumerate(raw_data):
-        # remove function definition since yaml load cannot handle it
-        if "!function" not in line:
-            safe_data.append(line)
-cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
+base_cache_dir = os.path.expanduser(hf_home)
+with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for _i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+data = yaml.safe_load("".join(safe_data)) or {}
+try:
+    cache_name = data["dataset_kwargs"]["cache_dir"]
+except KeyError as exc:
+    raise KeyError("Expected 'dataset_kwargs.cache_dir' in lvbench.yaml") from exc
-hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
-base_cache_dir = os.path.expanduser(hf_home)
-with open(Path(__file__).parent / "lvbench.yaml", "r") as f:
-    raw_data = f.readlines()
-    safe_data = []
-    for i, line in enumerate(raw_data):
-        # remove function definition since yaml load cannot handle it
-        if "!function" not in line:
-            safe_data.append(line)
-cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+# Root of the Hugging Face cache: HF_HOME if set, otherwise the default location ("~" expanded).
+hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
+base_cache_dir = os.path.expanduser(hf_home)
+# The task YAML carries custom "!function" tags that yaml.safe_load cannot construct,
+# so those lines are dropped before the rest of the file is parsed.
+with open(Path(__file__).parent / "lvbench.yaml", "r", encoding="utf-8") as f:
+    safe_data = [line for line in f if "!function" not in line]
+data = yaml.safe_load("".join(safe_data)) or {}
+try:
+    cache_name = data["dataset_kwargs"]["cache_dir"]
+except (KeyError, TypeError) as exc:
+    # TypeError covers a present-but-null "dataset_kwargs" entry
+    raise KeyError("Expected 'dataset_kwargs.cache_dir' in lvbench.yaml") from exc
+
+
+def lvbench_doc_to_visual(doc):
+    """
+    Return the local path of the video for ``doc`` as a one-element list.
+
+    The path is built as <base_cache_dir>/<cache_name>/<doc["video_path"]>.
+
+    Raises:
+        AssertionError: if no file exists at the resolved path.
+    """
+    video_path = os.path.join(base_cache_dir, cache_name, doc["video_path"])
+    # Name the missing file so a bad cache layout can be diagnosed from the error alone
+    assert os.path.exists(video_path), f"LVBench video not found: {video_path}"
+    return [video_path]
+
+
+def lvbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """
+    Build the text prompt for ``doc``: pre_prompt + doc["question"] + post_prompt.
+
+    Args:
+        doc: a dataset instance with a "question" field.
+        lmms_eval_specific_kwargs: optional mapping that may provide "pre_prompt"
+            and/or "post_prompt"; missing keys fall back to "" and
+            "\\nAnswer the question with the option letter" respectively.
+            The mapping is only read, never modified.
+    Returns:
+        The assembled prompt string.
+    """
+    prompt_kwargs = lmms_eval_specific_kwargs or {}
+    # Read defaults with .get so the caller's dict is not written to
+    pre_prompt = prompt_kwargs.get("pre_prompt", "")
+    post_prompt = prompt_kwargs.get("post_prompt", "\nAnswer the question with the option letter")
+    return pre_prompt + doc["question"] + post_prompt
+
+
+def extract_characters_regex(s):
+    s = s.strip()
+    answer_prefixes = [
+        "The best answer is",
+        "The correct answer is",
+        "The answer is",
+        "The answer",
+        "The best option is" "The correct option is",
+        "Best answer:" "Best option:",
+    ]
+    for answer_prefix in answer_prefixes:
+        s = s.replace(answer_prefix, "")
+
+    if len(s.split()) > 10 and not re.search("[ABCD]", s):
+        return ""
+
+    matches = re.search(r"[ABCD]", s)
+    if matches is None:
+        return ""
+    return matches[0]
-def extract_characters_regex(s):
-    s = s.strip()
-    answer_prefixes = [
-        "The best answer is",
-        "The correct answer is",
-        "The answer is",
-        "The answer",
-        "The best option is" "The correct option is",
-        "Best answer:" "Best option:",
-    ]
-    for answer_prefix in answer_prefixes:
-        s = s.replace(answer_prefix, "")
-
-    if len(s.split()) > 10 and not re.search("[ABCD]", s):
-        return ""
-
-    matches = re.search(r"[ABCD]", s)
-    if matches is None:
-        return ""
-    return matches[0]
+from typing import Optional
+def extract_characters_regex(s: str) -> str:
+    s = s.strip()
+    answer_prefixes = [
+        "The best answer is",
+        "The correct answer is",
+        "The answer is",
+        "The answer",
+        "The best option is",
+        "The correct option is",
+        "Best answer:",
+        "Best option:",
+    ]
+    for answer_prefix in answer_prefixes:
+        s = re.sub(re.escape(answer_prefix), "", s, flags=re.IGNORECASE).strip()
+
+    if len(s.split()) > 10 and not re.search(r"\b[ABCD]\b", s, flags=re.I):
+        return ""
+
+    matches = re.search(r"\b([ABCD])\b", s, flags=re.I)
+    if matches is None:
+        return ""
+    return matches.group(1).upper()
-def extract_characters_regex(s):
-    s = s.strip()
-    answer_prefixes = [
-        "The best answer is",
-        "The correct answer is",
-        "The answer is",
-        "The answer",
-        "The best option is" "The correct option is",
-        "Best answer:" "Best option:",
-    ]
-    for answer_prefix in answer_prefixes:
-        s = s.replace(answer_prefix, "")
-
-    if len(s.split()) > 10 and not re.search("[ABCD]", s):
-        return ""
-
-    matches = re.search(r"[ABCD]", s)
-    if matches is None:
-        return ""
-    return matches[0]
+from typing import Optional
+def extract_characters_regex(s: str) -> str:
+    """
+    Extract the chosen option letter (A-D) from a model response.
+
+    Known answer prefixes (e.g. "The answer is") are removed case-insensitively.
+    If what remains is a single letter, optionally wrapped in punctuation, it is
+    accepted in either case. Otherwise the first standalone UPPERCASE A-D is used.
+
+    Returns:
+        The option letter in upper case, or "" when none is found.
+    """
+    s = s.strip()
+    answer_prefixes = [
+        "The best answer is",
+        "The correct answer is",
+        "The answer is",
+        "The answer",
+        "The best option is",
+        "The correct option is",
+        "Best answer:",
+        "Best option:",
+    ]
+    for answer_prefix in answer_prefixes:
+        s = re.sub(re.escape(answer_prefix), "", s, flags=re.IGNORECASE).strip()
+
+    # A bare answer such as "b", "(c)" or "D." is unambiguous, so accept either case
+    lone = re.fullmatch(r"\W*([ABCDabcd])\W*", s)
+    if lone is not None:
+        return lone.group(1).upper()
+
+    # Inside longer text only uppercase letters count: a case-insensitive match
+    # would read the English article "a" as option A
+    matches = re.search(r"\b([ABCD])\b", s)
+    if matches is None:
+        return ""
+    return matches.group(1)
+
+
+def lvbench_process_results(doc, results):
+    """
+    Score a single LVBench prediction.
+
+    Args:
+        doc: an instance of the eval dataset; doc["answer"] is the ground-truth option
+        results: [pred], where pred is the raw model response
+    Returns:
+        a dictionary with key: metric name ("lvbench_score"), value: True when the
+        option letter extracted from pred equals doc["answer"], else False
+    """
+    pred_ans = extract_characters_regex(results[0])
+    # The ground truth is compared verbatim (no case or punctuation normalisation)
+    return {"lvbench_score": pred_ans == doc["answer"]}