EvolvingLMMs-Lab
diff --git a/‎lmms_eval/models/chat/vllm.py‎
Lines changed: 1 addition & 18 deletions b/‎lmms_eval/models/chat/vllm.py‎
Lines changed: 1 addition & 18 deletions
diff --git a/‎lmms_eval/models/simple/vllm.py‎
Lines changed: 4 additions & 5 deletions b/‎lmms_eval/models/simple/vllm.py‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎lmms_eval/tasks/_task_utils/math_verify_utils.py‎
Lines changed: 162 additions & 0 deletions b/‎lmms_eval/tasks/_task_utils/math_verify_utils.py‎
Lines changed: 162 additions & 0 deletions
diff --git a/‎lmms_eval/tasks/olympiadbench_official/en_utils.py‎
Lines changed: 80 additions & 0 deletions b/‎lmms_eval/tasks/olympiadbench_official/en_utils.py‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎lmms_eval/tasks/olympiadbench_official/olympiadbench_all_boxed.yaml‎
Lines changed: 38 additions & 0 deletions b/‎lmms_eval/tasks/olympiadbench_official/olympiadbench_all_boxed.yaml‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎lmms_eval/tasks/olympiadbench_official/olympiadbench_boxed.yaml‎
Lines changed: 38 additions & 0 deletions b/‎lmms_eval/tasks/olympiadbench_official/olympiadbench_boxed.yaml‎
Lines changed: 38 additions & 0 deletions
@@ -1,29 +1,12 @@
-import asyncio
-import base64
-import json
-import os
-import time
-from concurrent.futures import ThreadPoolExecutor
-from copy import deepcopy
-from io import BytesIO
-from multiprocessing import cpu_count
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple
 
-import numpy as np
-from accelerate import Accelerator, DistributedType
-from decord import VideoReader, cpu
-from loguru import logger as eval_logger
-from PIL import Image
 from tqdm import tqdm
 
 from lmms_eval.api.instance import Instance
-from lmms_eval.api.model import lmms
 from lmms_eval.api.registry import register_model
 from lmms_eval.models.simple.vllm import VLLM as VLLMSimple
 from lmms_eval.protocol import ChatMessages
 
-NUM_SECONDS_TO_SLEEP = 5
-
 try:
     from vllm import LLM, SamplingParams
 except ImportError:
 
@@ -20,7 +20,8 @@
 from lmms_eval.api.model import lmms
 from lmms_eval.api.registry import register_model
 
-NUM_SECONDS_TO_SLEEP = 5
+# Values read from the environment are always strings when the variable is set,
+# so cast explicitly: WORKERS is handed to ThreadPoolExecutor(max_workers=...),
+# which rejects a str, and both defaults are integers.
+NUM_SECONDS_TO_SLEEP = int(os.getenv("NUM_SECONDS_TO_SLEEP", 5))
+WORKERS = int(os.getenv("WORKERS", 32))
 
 try:
     from vllm import LLM, SamplingParams
@@ -37,7 +38,6 @@ def __init__(
         gpu_memory_utilization: float = 0.8,
         batch_size: int = 1,
         max_frame_num: int = 32,
-        threads: int = 16,  # Threads to use for decoding visuals
         trust_remote_code: Optional[bool] = True,
         chat_template: Optional[str] = None,
         min_image_pixels: int = 28,  # minimum image dimension, required for Qwen 2/2.5-VL models
@@ -49,11 +49,10 @@ def __init__(
         # Here we just use the same token as llava for convenient
         self.model = model
         self.max_frame_num = max_frame_num
-        self.threads = threads
         self.chat_template = chat_template
         self.min_image_pixels = min_image_pixels
         # Qwen 2/2.5-VL models enforce minimum image dimensions
-        self._enforce_image_resize = self._is_qwen_vl_model(model_version)
+        self._enforce_image_resize = self._is_qwen_vl_model(model)
 
         # Convert any string arguments that start with { and end with } to dictionaries
         for key, value in kwargs.items():
@@ -188,7 +187,7 @@ def generate_until(self, requests) -> List[str]:
                     visuals = self.flatten(visuals)
                     imgs = []  # multiple images or frames for video
                     all_tasks = []
-                    with ThreadPoolExecutor(max_workers=self.threads) as executor:
+                    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
                         for visual in visuals:
                             if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual):
                                 all_tasks.append(executor.submit(self.encode_video, visual))
 
@@ -0,0 +1,162 @@
+# Copyright 2025 Xiaomi Corporation.
+
+
+import importlib
+
+from wrapt_timeout_decorator import timeout
+
+
+def patch_target_module(
+    to_patch: str,
+    replace_with,
+):
+    """Rebind one attribute of an importable module.
+
+    Args:
+        to_patch: Dotted path such as ``"package.module.attribute"``. Everything
+            before the last dot is imported as a module; the final component is
+            the attribute name that gets overwritten.
+        replace_with: Object to store under that attribute name.
+
+    Raises:
+        AssertionError: If ``to_patch`` contains no dot.
+    """
+    module_path, dot, attr_name = to_patch.rpartition(".")
+    assert dot, "must have an object to patch"
+
+    target_module = importlib.import_module(module_path)
+    setattr(target_module, attr_name, replace_with)
+
+
+def timeout_adapter(func=None, **kwargs):
+    """Build a ``timeout`` decorator that does not rely on signals.
+
+    The ``timeout_seconds`` keyword (``None`` when absent) is forwarded to
+    ``timeout`` as ``dec_timeout``; ``use_signals`` is forced to ``False``. Any
+    remaining keyword arguments are passed through unchanged.
+
+    Args:
+        func: Accepted for call compatibility; not used.
+        **kwargs: Keyword arguments, optionally including ``timeout_seconds``.
+
+    Returns:
+        Whatever ``timeout(...)`` returns for the translated arguments.
+    """
+    seconds = kwargs.pop("timeout_seconds", None)
+    return timeout(dec_timeout=seconds, use_signals=False, **kwargs)
+
+
+# Replace the signal-based timeout with a non-signal-based timeout to allow
+# multithreading. The `timeout` name is rebound separately in each of these three
+# math_verify modules; this runs at import time of this module.
+patch_target_module("math_verify.utils.timeout", timeout_adapter)
+patch_target_module("math_verify.parser.timeout", timeout_adapter)
+patch_target_module("math_verify.grader.timeout", timeout_adapter)
+
+
+import os
+
+from latex2sympy2_extended.latex2sympy2 import NormalizationConfig
+from math_verify import *
+
+
+def monkeypatch_math_verify_logger():
+    """
+    Silence math_verify by swapping its module-level loggers for an inert stand-in.
+
+    Starting from the ``math_verify`` package, every module object reachable
+    through module attributes is collected. Each collected module whose name
+    starts with ``"math_verify"`` and that has a ``logger`` attribute gets that
+    attribute replaced by an object that returns itself for any attribute
+    access, call or index, so logging calls become no-ops.
+    """
+    import math_verify
+
+    class SelfReturningObject:
+        """Absorbs attribute access, calls and indexing by returning itself."""
+
+        def __getattr__(self, name):
+            return self
+
+        def __call__(self, *args, **kwargs):
+            return self
+
+        def __getitem__(self, key):
+            return self
+
+    silent_logger = SelfReturningObject()
+
+    def collect_modules(module, seen):
+        # Record the module first so cyclic references between modules terminate.
+        seen.append(module)
+        for member in module.__dict__.values():
+            if isinstance(member, type(math_verify)) and member not in seen:
+                collect_modules(member, seen)
+
+    reachable = []
+    collect_modules(math_verify, reachable)
+    # The walk also passes through foreign modules; only math_verify's own are patched.
+    own_modules = [mod for mod in reachable if mod.__name__.startswith("math_verify")]
+    for mod in own_modules:
+        if hasattr(mod, "logger"):
+            mod.logger = silent_logger
+
+
+class MathVerifyFn:
+    """Score a model answer against a reference answer using ``math_verify``.
+
+    Calling an instance returns a ``(score, extracted_prediction)`` tuple, where
+    ``score`` is ``correct_score`` or ``incorrect_score`` and
+    ``extracted_prediction`` is a string form of the answer parsed from the model
+    output (``""`` when nothing was extracted or an error occurred).
+    """
+
+    def __init__(self, correct_score=1.0, incorrect_score=0.0, timeout_seconds=10, strict=True, silent=True):
+        """Store the scoring configuration.
+
+        Args:
+            correct_score: Score returned when the prediction is verified.
+            incorrect_score: Score returned otherwise, including on errors.
+            timeout_seconds: Timeout forwarded to ``parse`` and ``verify``.
+            strict: Forwarded to ``verify`` as its ``strict`` argument.
+            silent: When true, replace the ``math_verify`` module loggers with
+                a no-op object.
+        """
+        self.correct_score = correct_score
+        self.incorrect_score = incorrect_score
+        self.timeout_seconds = timeout_seconds
+        self.strict = strict
+        if silent:
+            monkeypatch_math_verify_logger()
+
+    def __call__(self, solution_str: str, ground_truth) -> tuple:
+        """Return ``(score, extracted_prediction)``; see ``compute_score_with_ext``."""
+        return self.compute_score_with_ext(solution_str, ground_truth)
+
+    def preprocess_answer(self, annotated_answer: str) -> str:
+        """Wrap a reference answer in ``\\boxed{...}`` unless it already has one.
+
+        An answer delimited by ``$`` on both ends has its leading and trailing
+        ``$`` characters stripped before wrapping. Empty or ``None`` input is
+        returned unchanged.
+        """
+        if annotated_answer:
+            if annotated_answer.startswith("$") and annotated_answer.endswith("$"):
+                annotated_answer = f"\\boxed{{{annotated_answer.strip('$')}}}"
+            elif "\\boxed" not in annotated_answer:
+                annotated_answer = f"\\boxed{{{annotated_answer}}}"
+        return annotated_answer
+
+    def parse_LatexExpr(self, input_str: str):
+        """Parse ``input_str`` with a LaTeX extraction config and return the result of ``parse``."""
+        config = NormalizationConfig(
+            basic_latex=True,
+            units=True,
+            malformed_operators=True,
+            nits=True,
+            boxed="last",
+            equations=False,
+        )
+        return parse(
+            input_str,
+            extraction_mode="first_match",
+            extraction_config=[
+                LatexExtractionConfig(boxed_match_priority=0, normalization_config=config),
+            ],
+            parsing_timeout=self.timeout_seconds,
+        )
+
+    def parse_String(self, input_str: str):
+        """Parse ``input_str`` with a plain string extraction config and return the result of ``parse``."""
+        return parse(
+            input_str,
+            extraction_mode="first_match",
+            extraction_config=[
+                StringExtractionConfig(),
+            ],
+            parsing_timeout=self.timeout_seconds,
+        )
+
+    def judge_with_ext(self, solution_str: str, ground_truth) -> tuple:
+        """Verify a prediction against the reference answer.
+
+        Args:
+            solution_str: Model output to extract the prediction from.
+            ground_truth: Reference answer; it is boxed via ``preprocess_answer``
+                before parsing.
+
+        Returns:
+            ``(is_correct, extracted)``: ``is_correct`` is a bool and
+            ``extracted`` is the first ``str`` item among the parsed prediction
+            candidates, else ``str()`` of the first candidate, else ``""``.
+        """
+        answer_parsed = self.parse_LatexExpr(self.preprocess_answer(ground_truth))
+        ext_pred = self.parse_LatexExpr(solution_str)
+
+        # Prefer a candidate that is already a string; otherwise stringify the first one.
+        ext_str = next((item for item in ext_pred if isinstance(item, str)), None)
+        if ext_str is None:
+            ext_str = str(next(iter(ext_pred), ""))
+
+        # An empty extraction can never match, so skip verification in that case.
+        is_correct = len(ext_pred) != 0 and bool(
+            verify(answer_parsed, ext_pred, timeout_seconds=self.timeout_seconds, strict=self.strict)
+        )
+        return is_correct, ext_str
+
+    def compute_score_with_ext(self, solution_str: str, ground_truth) -> tuple:
+        """Return ``(score, extracted_prediction)`` for one sample.
+
+        Any exception raised while parsing or verifying is printed and treated
+        as an incorrect answer with an empty extracted prediction.
+        """
+        try:
+            is_correct, ext_pred = self.judge_with_ext(solution_str, ground_truth)
+        except Exception as e:
+            # Best effort: one failing sample must not abort the whole evaluation.
+            print(e)
+            return self.incorrect_score, ""
+        return (self.correct_score if is_correct else self.incorrect_score), ext_pred
+
+
+if __name__ == "__main__":
+    # Smoke check when run as a script: score the boxed prediction "D" against
+    # the reference "D" and print the returned result.
+    math_verify_fn = MathVerifyFn()
+    print(math_verify_fn("\\boxed{D}", "D"))
@@ -0,0 +1,80 @@
+# Copyright 2025 Xiaomi Corporation.
+
+import datetime
+import json
+import os
+
+from loguru import logger as eval_logger
+
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+from lmms_eval.tasks.olympiadbench_official.olympiadbench_evals import (
+    OlympiadBenchEvaluator,
+)
+
+# Absolute path of the directory that contains this module.
+dir_name = os.path.dirname(os.path.abspath(__file__))
+
+# Evaluator instance created once at import time; its judge() method is what
+# olympiadbench_process_results calls to compare a prediction with the answer.
+olympiadbench_evaluator = OlympiadBenchEvaluator()
+
+
+def olympiadbench_doc_to_visual(doc):
+    """Return the document's images converted to RGB.
+
+    Reads the keys ``image_1`` through ``image_5`` from ``doc``, drops entries
+    that are ``None`` and calls ``convert("RGB")`` on the rest, keeping order.
+    """
+    slots = (doc[f"image_{idx}"] for idx in range(1, 6))
+    present = [picture for picture in slots if picture is not None]
+    return [picture.convert("RGB") for picture in present]
+
+
+def olympiadbench_doc_to_text(doc):
+    """Build the prompt text for one document.
+
+    The prompt consists of a header naming the competition subject
+    (``doc["subfield"]``), the question, a line stating the expected answer type,
+    and instructions to finish with a boxed final answer. Wording differs
+    depending on whether ``doc["is_multiple_answer"]`` is truthy. The answer type
+    ``"Need_human_evaluate"`` is rendered as ``"proof based"``.
+    """
+    question = doc["question"]
+    subject = doc["subfield"]
+    # None and False both mean a single expected answer.
+    single_answer = not doc["is_multiple_answer"]
+    answer_kind = doc["answer_type"]
+    if answer_kind == "Need_human_evaluate":
+        answer_kind = "proof based"
+
+    header = f"The following is a question from an International {subject} competition.\n"
+
+    if single_answer:
+        type_line = f"The answer of the question should be {answer_kind}.\n"
+        closing = '"So the final answer is \\boxed{answer}."\n'
+    else:
+        type_line = f"The question has multiple answers, each of them should be {answer_kind}.\n"
+        closing = "So the final answer is \\boxed{multiple answers connected with commas}.\n"
+
+    instructions = (
+        "Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with "
+    )
+
+    return header + question + "\n" + type_line + instructions + closing
+
+
+def olympiadbench_process_results(doc, results):
+    """Turn one model response into a per-document result dict.
+
+    For proof questions (``question_type`` is ``"Theorem proof"`` or there is no
+    ``final_answer``) the stripped response is returned under ``"submission"``.
+    Otherwise the text after the last ``"final answer is"`` is cleaned of double
+    quotes, newlines, spaces and surrounding full stops, then judged against the
+    first reference answer with ``doc["error"]`` as precision (0 when ``None``);
+    the integer verdict is returned under ``"exact_match"``.
+    """
+    tolerance = doc["error"]
+    proof_question = doc["question_type"] == "Theorem proof" or doc["final_answer"] is None
+    tolerance = 0 if tolerance is None else tolerance
+    response = results[0].strip()
+
+    if proof_question:
+        return {"submission": response}
+
+    answer_tail = response.split("final answer is")[-1]
+    for unwanted in ('"', "\n", " "):
+        answer_tail = answer_tail.replace(unwanted, "")
+    # Strip ASCII full stops first, then ideographic ones, in that order.
+    answer_tail = answer_tail.strip(".").strip("。")
+    verdict = olympiadbench_evaluator.judge(answer_tail, doc["final_answer"][0], tolerance)
+    return {"exact_match": int(verdict)}
+
+
+def olympiadbench_aggregate_results(results, args):
+    """Write the collected results to a timestamped JSON submission file.
+
+    Args:
+        results: JSON-serialisable results to dump.
+        args: Passed through to ``generate_submission_file`` to resolve the
+            output path.
+
+    Returns:
+        None. The file path is printed after writing.
+    """
+    now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
+    submission_file_name = f"olympiadbench-test-en-submission-{now_date_time}.json"
+    path = generate_submission_file(submission_file_name, args)
+    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
+    # opened as UTF-8 explicitly instead of relying on the platform default encoding.
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False)
+    print(f"Submission file saved to {path}")
@@ -0,0 +1,38 @@
+# Copyright 2025 Xiaomi Corporation.
+
+# Task "olympiadbench_all_boxed": runs on the `test` split of the `all_no_proof`
+# configuration of the dataset below.
+dataset_path: lscpku/OlympiadBench-official
+dataset_kwargs:
+  token: True
+dataset_name: all_no_proof
+task : "olympiadbench_all_boxed"
+test_split: test
+output_type: generate_until
+# NOTE(review): the `!function utils.*` references resolve against a `utils`
+# module that is not visible in this change; confirm it defines every function
+# named here, including olympiadbench_math_verify_aggregate_results.
+doc_to_visual: !function utils.olympiadbench_doc_to_visual
+doc_to_text: !function utils.olympiadbench_doc_to_text
+doc_to_target: "answer"
+# Deterministic decoding: temperature 0, sampling disabled, single beam.
+generation_kwargs:
+  max_new_tokens: 32768
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.olympiadbench_process_results
+# exact_match is averaged; every other metric uses a custom aggregation function.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+  - metric: math_verify
+    aggregation: !function utils.olympiadbench_math_verify_aggregate_results
+    higher_is_better: true
+  - metric: Math_English
+    aggregation: !function utils.olympiadbench_aggregate_results
+    higher_is_better: true
+  - metric: Math_Chinese
+    aggregation: !function utils.olympiadbench_aggregate_results
+    higher_is_better: true
+  - metric: Physics_English
+    aggregation: !function utils.olympiadbench_aggregate_results
+    higher_is_better: true
+  - metric: Physics_Chinese
+    aggregation: !function utils.olympiadbench_aggregate_results
+    higher_is_better: true
@@ -0,0 +1,38 @@
+# Copyright 2025 Xiaomi Corporation.
+
+# Task "olympiadbench_boxed": runs on the `test` split of the `all`
+# configuration of the dataset below.
+dataset_path: lscpku/OlympiadBench-image
+dataset_kwargs:
+  token: True
+dataset_name: all
+task : "olympiadbench_boxed"
+test_split: test
+output_type: generate_until
+# NOTE(review): the `!function utils.*` references resolve against a `utils`
+# module that is not visible in this change; confirm it defines every function
+# named here, including olympiadbench_math_verify_aggregate_results.
+doc_to_visual: !function utils.olympiadbench_doc_to_visual
+doc_to_text: !function utils.olympiadbench_doc_to_text
+doc_to_target: "answer"
+# Deterministic decoding: temperature 0, sampling disabled, single beam.
+generation_kwargs:
+  max_new_tokens: 16384
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.olympiadbench_process_results
+# exact_match is averaged; every other metric uses a custom aggregation function.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+  - metric: math_verify
+    aggregation: !function utils.olympiadbench_math_verify_aggregate_results
+    higher_is_better: true
+  - metric: Math_English
+    aggregation: !function utils.olympiadbench_aggregate_results
+    higher_is_better: true
+  - metric: Math_Chinese
+    aggregation: !function utils.olympiadbench_aggregate_results
+    higher_is_better: true
+  - metric: Physics_English
+    aggregation: !function utils.olympiadbench_aggregate_results
+    higher_is_better: true
+  - metric: Physics_Chinese
+    aggregation: !function utils.olympiadbench_aggregate_results
+    higher_is_better: true