Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 9 additions & 68 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,11 @@ def get_gpu_list():
from vlmeval.inference import infer_data_job
from vlmeval.inference_mt import infer_data_job_mt
from vlmeval.inference_video import infer_data_job_video
from vlmeval.judge import get_default_judge_model
from vlmeval.smp import (MMBenchOfficialServer, build_eval_id, collect_run_benchmark_report,
get_eval_file_format, get_logger, get_pred_file_format,
get_pred_file_path, githash, is_prediction_complete, listinstr, load,
load_env, prepare_reuse_files, proxy_set, setup_logger, timestr,
get_pred_file_path, githash, is_prediction_complete, load, load_env,
prepare_reuse_files, proxy_set, setup_logger, timestr,
upsert_dataset_status, upsert_run_status)
from vlmeval.utils.result_transfer import MMMU_result_transfer, MMTBench_result_transfer

Expand Down Expand Up @@ -221,7 +222,7 @@ def build_model_from_base_url(args):
return model_args


def get_judge_kwargs(dataset_name, dataset_type, args):
def get_judge_kwargs(dataset_name, dataset_type, args, dataset=None):
"""Determine judge kwargs based on dataset name and type.

Uses run.py's logic as the canonical source for dataset-specific judge model
Expand Down Expand Up @@ -256,69 +257,9 @@ def get_judge_kwargs(dataset_name, dataset_type, args):
if args.judge is not None:
judge_kwargs['model'] = args.judge
else:
if dataset_type in ['MCQ', 'Y/N', 'MCQ_MMMU_Pro'] or listinstr(
['moviechat1k', 'mme-reasoning'], dataset_name.lower()
):
if listinstr(['WeMath', 'MME-Reasoning'], dataset_name):
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['VisualPuzzles'], dataset_name):
judge_kwargs['model'] = 'exact_matching'
elif listinstr(['PuzzleVQA'], dataset_name):
judge_kwargs['model'] = 'exact_matching'
elif listinstr(['VisuLogic'], dataset_name):
judge_kwargs['model'] = 'exact_matching'
else:
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['MMVet', 'LLaVABench', 'MMBench_Video'], dataset_name):
if listinstr(['LLaVABench_KO'], dataset_name):
judge_kwargs['model'] = 'gpt-4o-0806'
else:
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['VGRPBench'], dataset_name):
judge_kwargs['model'] = 'gpt-4o'
elif listinstr(
['MathVista', 'MathVerse', 'MathVision', 'LENS', 'DynaMath', 'VL-RewardBench',
'LogicVista', 'MOAT', 'OCR_Reasoning', 'VTCBench', 'Asclepius',
'MMSafetyBench', 'MSSBench', 'SIUO', 'SIUO_GEN', 'XSTest', 'Flames'], dataset_name
):
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['OlympiadBench'], dataset_name):
use_api_judger = judge_kwargs.get("olympiad_use_api_judger", False)
if use_api_judger:
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(
['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench',
'WildVision', 'MMAlignBench', 'MM-IFEval'], dataset_name
):
judge_kwargs['model'] = 'gpt-4o'
elif listinstr(['ChartMimic'], dataset_name):
judge_kwargs['model'] = 'gpt-4o'
elif listinstr(['VDC'], dataset_name):
judge_kwargs['model'] = 'llama31-8b'
elif listinstr(['Video_MMLU_QA', 'Video_MMLU_CAP'], dataset_name):
judge_kwargs['model'] = 'qwen-72b'
elif listinstr(['MMVMBench'], dataset_name):
judge_kwargs['model'] = 'gpt-4o'
elif listinstr(['CVQA_EN', 'CVQA_LOC'], dataset_name):
judge_kwargs['model'] = 'gpt-4.1'
elif listinstr(['M4Bench'], dataset_name):
judge_kwargs['model'] = 'gpt-4o'
elif listinstr(['AyaVisionBench'], dataset_name):
judge_kwargs['model'] = 'gpt-4.1'
elif listinstr(['MathCanvas'], dataset_name):
judge_kwargs['model'] = 'gpt-4.1-2025-04-14'
elif listinstr(['MMReason'], dataset_name):
judge_kwargs['model'] = 'gpt-4.1'
elif listinstr(['CoreCognition'], dataset_name):
judge_kwargs['model'] = 'gpt-4.1'
elif listinstr(['WorldVQA'], dataset_name):
judge_kwargs['model'] = 'gpt-4o-1120'
elif listinstr(['Video-MME'], dataset_name):
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['MaCBench'], dataset_name):
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['SciDocBench'], dataset_name):
judge_kwargs['model'] = 'gpt-4o-mini'
judge_model = get_default_judge_model(dataset, dataset_type, judge_kwargs)
if judge_model is not None:
judge_kwargs['model'] = judge_model

if args.use_verifier:
judge_kwargs['use_verifier'] = True
Expand Down Expand Up @@ -655,7 +596,7 @@ def run_local_mode(args):
)
continue

judge_kwargs = get_judge_kwargs(dataset_name, dataset.TYPE, args)
judge_kwargs = get_judge_kwargs(dataset_name, dataset.TYPE, args, dataset=dataset)
judge_model = judge_kwargs.get('model', '')

if RANK == 0:
Expand Down Expand Up @@ -1041,7 +982,7 @@ def run_api_mode(args):
logger.info(f'{ds_name} requires special handling, skipped in pipeline.')
continue

judge_kwargs = get_judge_kwargs(ds_name, dataset.TYPE, args)
judge_kwargs = get_judge_kwargs(ds_name, dataset.TYPE, args, dataset=dataset)
judge_model = judge_kwargs.get('model', '')
logger.info(f'Judge kwargs: {judge_kwargs}')

Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/asclepius.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Asclepius(ImageVQADataset):
"""

TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
MODALITY = 'IMAGE'

DATASET_URL = {
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/chartmimic.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ def _judge_one_item(item):

class ChartMimic(ImageBaseDataset):
TYPE = "VQA"
DEFAULT_JUDGE_MODEL = 'gpt-4o'

# TODO: add dataset url and md5
DATASET_URL = {
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/dude.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def DUDE_acc(result_file):
class DUDE(ImageBaseDataset):

TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o'

DATASET_URL = {
'DUDE': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE.tsv',
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/dynamath.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def DynaMath_auxeval(model, line):
class Dynamath(ImageBaseDataset):

TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
DATASET_URL = {
'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv',
'DynaMath_noprompt': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv',
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/flames.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def _flames_judge(model, dimension, question, response):

class FlamesDataset(TextBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
MODALITY = 'TEXT'
DATASET_URL = {'Flames': 'https://opencompass.openxlab.space/utils/VLMEval/Flames.tsv'}
DATASET_MD5 = {'Flames': 'b567b6c96717c9e6c8bb9b458a85635a'}
Expand Down
4 changes: 4 additions & 0 deletions vlmeval/dataset/image_mcq.py
Original file line number Diff line number Diff line change
Expand Up @@ -1674,6 +1674,7 @@ def build_prompt(self, line):

class VisualPuzzles(ImageMCQDataset):
TYPE = "MCQ"
DEFAULT_JUDGE_MODEL = 'exact_matching'
DATASET_URL = {
'VisualPuzzles': 'https://opencompass.openxlab.space/utils/VLMEval/VisualPuzzles.tsv'
}
Expand Down Expand Up @@ -1771,6 +1772,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class PuzzleVQA(ImageMCQDataset):
TYPE = "MCQ"
DEFAULT_JUDGE_MODEL = 'exact_matching'
DATASET_URL = {
'PuzzleVQA': 'https://opencompass.openxlab.space/utils/VLMEval/PuzzleVQA.tsv'
}
Expand Down Expand Up @@ -1842,6 +1844,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class VisuLogic(ImageMCQDataset):
TYPE = "MCQ"
DEFAULT_JUDGE_MODEL = 'exact_matching'
DATASET_URL = {
'VisuLogic': 'https://opencompass.openxlab.space/utils/VLMEval/VisuLogic.tsv'
}
Expand Down Expand Up @@ -3044,6 +3047,7 @@ def compute_iou(box1, box2):


class CVQA(ImageMCQDataset):
DEFAULT_JUDGE_MODEL = 'gpt-4.1'

@classmethod
def supported_datasets(cls):
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/image_mt.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def build_prompt(self, line):

class MMDUDataset(ImageMTDataset):

DEFAULT_JUDGE_MODEL = 'gpt-4o'
DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
DIMS = [
Expand Down
18 changes: 18 additions & 0 deletions vlmeval/dataset/image_vqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class VTCBench(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
_DATASET_PATH = "https://huggingface.co/datasets/MLLM-CL/VTCBench"
# Dataset URL mapping - points to different splits of HuggingFace dataset
DATASET_URL = {
Expand Down Expand Up @@ -571,6 +572,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class MathVista(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
DATASET_URL = {
'MathVista_MINI':
'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
Expand Down Expand Up @@ -700,6 +702,7 @@ def MathVista_acc_verifier(result_file):

class MathVerse(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
DATASET_URL = {
'MathVerse_MINI':
'https://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa
Expand Down Expand Up @@ -842,6 +845,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class MathVision(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
DATASET_URL = {
'MathVision':
'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
Expand Down Expand Up @@ -984,6 +988,7 @@ def report_primary_metric(cls, metrics: dict | None) -> dict:

class LENS(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
DATASET_URL = {
'LENS-CN-QA':
'https://huggingface.co/datasets/songlier/LENS/resolve/main/LENS-CN-QA.tsv',
Expand Down Expand Up @@ -1272,6 +1277,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class OlympiadBench(ImageBaseDataset):
TYPE = 'VQA_ex_prompt'
DEFAULT_JUDGE_MODEL = {'olympiad_use_api_judger': 'gpt-4o-mini'}
DATASET_URL = {
'OlympiadBench':
'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
Expand Down Expand Up @@ -1702,6 +1708,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class LogicVista(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
DATASET_URL = {
'LogicVista':
'https://opencompass.openxlab.space/utils/VLMEval/LogicVista.tsv'
Expand Down Expand Up @@ -1891,6 +1898,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class LLaVABench(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4-turbo'
DATASET_URL = {
'LLaVABench':
'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'
Expand Down Expand Up @@ -1932,6 +1940,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class LLaVABench_KO(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-0806'
DATASET_URL = {
'LLaVABench_KO':
'https://huggingface.co/datasets/NCSOFT/K-LLaVA-W/resolve/main/LLaVABench_KO.tsv'
Expand Down Expand Up @@ -1974,6 +1983,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class VGRPBench(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o'

DATASET_URL = {
'VGRPBench':
Expand Down Expand Up @@ -2036,6 +2046,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class MMVet(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4-turbo'
DATASET_URL = {
'MMVet':
'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv',
Expand Down Expand Up @@ -3315,6 +3326,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class OCR_Reasoning(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
DATASET_URL = {
'OCR_Reasoning':
'https://opencompass.openxlab.space/utils/VLMEval/OCR_Reasoning.tsv'
Expand Down Expand Up @@ -3607,6 +3619,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class MMEReasoning(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
DATASET_URL = {'MME-Reasoning': 'https://huggingface.co/datasets/U4R/MME-Reasoning/blob/main/MME_Reasoning.tsv'}
DATASET_MD = {'MME-Reasoning': 'b243f44778782d3821523689f6b40a1e'}

Expand Down Expand Up @@ -3802,6 +3815,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class MMVMBench(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o'
DATASET_URL = {
'MMVMBench':
'https://opencompass.openxlab.space/utils/VLMEval/MMVMBench.tsv'
Expand Down Expand Up @@ -3984,6 +3998,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class AyaVisionBench(ImageVQADataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4.1'
DATASET_URL = {
"AyaVisionBench":
"https://huggingface.co/datasets/timothycdc/"
Expand Down Expand Up @@ -4063,6 +4078,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class MathCanvas(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4.1-2025-04-14'
DATASET_URL = {
"MathCanvas-Bench":
"https://huggingface.co/datasets/shiwk24/MathCanvas-Bench/resolve/main/MathCanvas_Bench_VLMEvalKit.tsv"
Expand Down Expand Up @@ -4161,6 +4177,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class MMReason(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4.1'
mini_path = 'https://huggingface.co/datasets/HuanjinYao/MMReason/resolve/main/MMReason_testmini.tsv?download=true'
DATASET_URL = {
'MMReason_testmini': mini_path,
Expand Down Expand Up @@ -4255,6 +4272,7 @@ def evaluate(self, eval_file, **judge_kwargs):

class CoreCognition(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4.1'

DATASET_URL = {
'CoreCognition': 'https://huggingface.co/datasets/ZTWHHH/CoreCognition/resolve/main/CoreCognition.tsv'
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/m4bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class M4Bench(ImageBaseDataset):
Dataset class for M4Bench, handling single and dual image inputs.
"""
TYPE = 'M4Bench'
DEFAULT_JUDGE_MODEL = 'gpt-4o'

DATASET_URL = {
"State_Invariance": "https://huggingface.co/datasets/Anonymous8976/M4Bench/resolve/main/State_Invariance.tsv", # noqa: E501
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/macbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def macbench_auxeval(model, line):

class MaCBench(ImageBaseDataset):
TYPE = 'MaCBench'
DEFAULT_JUDGE_MODEL = 'gpt-4o-mini'
DATASET_URL = {'MaCBench': ''}
DATASET_MD5 = {'MaCBench': '0e163396dd28886fd828e101f24afdf6'}

Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/miabench.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def get_score_dict(data, score_raw):

class MIABench(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o'

DATASET_URL = {
'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/mmalignbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def MMAlignBench_auxeval(model, line):

class MMAlignBench(ImageBaseDataset):
TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o'
DATASET_URL = {'MMAlignBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMAlignBench.tsv'}
DATASET_MD5 = {'MMAlignBench': 'd00d8e61c99257cbaf76d8d5e926f01e'}

Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/mmbench_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ class MMBenchVideo(VideoBaseDataset):
"""

TYPE = 'Video-VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4-turbo'

def __init__(self, dataset='MMBench-Video', pack=False, nframe=0, fps=-1):
super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps)
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/mmifeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ def judge_one_item(item, retry=3):

class MMIFEval(ImageBaseDataset):
TYPE = "VQA"
DEFAULT_JUDGE_MODEL = 'gpt-4o'

# TODO: add dataset url and md5
DATASET_URL = {"MM-IFEval": 'https://opencompass.openxlab.space/utils/VLMEval/MM-IFEval.tsv'}
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/mmlongbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,7 @@ def MMLongBench_acc(result_file):
class MMLongBench(ImageBaseDataset):

TYPE = 'VQA'
DEFAULT_JUDGE_MODEL = 'gpt-4o'

DATASET_URL = {
'MMLongBench_DOC': 'https://opencompass.openxlab.space/utils/VLMEval/MMLongBench_DOC.tsv',
Expand Down
Loading
Loading