Skip to content

Commit 3ebd266

Browse files
committed
new models
1 parent 9859a8a commit 3ebd266

7 files changed

Lines changed: 334 additions & 1 deletion

File tree

examples/slurm/gemma4_26b.slurm

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --account=infra01
#SBATCH --job-name=eval-gemma4-26b
#SBATCH --environment=/users/mikhaika/.edf/lmms_eval_vllm.toml
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=normal
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=288
#SBATCH --time=12:00:00
#SBATCH --output=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_gemma4_26b_%j.out
#SBATCH --error=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_gemma4_26b_%j.err
#
# Evaluate google/gemma-4-26B-A4B-it with the lmms_eval "vllm" backend on the
# medical text + VQA task list below, launched through torchrun with 4
# processes on a single node.
#
# NOTE(review): the --environment path above is tied to one user's home
# directory, while every other path is derived from the submitting user;
# adjust it before submitting from another account.

# Abort on the first failing command so that a broken install or a crashed
# evaluation makes the job exit non-zero instead of printing "End Reached"
# and exiting 0.
set -euo pipefail

# Mirror stdout and stderr into one combined, append-only .log file in
# addition to the scheduler-managed .out/.err files.
# SLURM_JOB_ID is empty when the script is run outside of Slurm.
LOG_BASE="/iopsstor/scratch/cscs/${USER}/lmms-eval/logs/eval_gemma4_26b_${SLURM_JOB_ID:-}"
exec > >(tee -a "${LOG_BASE}.log")
exec 2> >(tee -a "${LOG_BASE}.log" >&2)

TASKS="pmc_vqa,slake,vqa_rad,medqa,medmcqa,pubmedqa,mmlu_medical,path_vqa,path_mmu_test_tiny"
BATCH_SIZE=64
MODEL="vllm"
MODEL_ARGS="model=google/gemma-4-26B-A4B-it,tensor_parallel_size=1,gpu_memory_utilization=0.9,dtype=bfloat16,max_model_len=16384"
RES_PATH="/iopsstor/scratch/cscs/${USER}/PDM/results/lmms_eval/gemma4_26b_results"
EVAL_DIR="/iopsstor/scratch/cscs/${USER}/lmms-eval"

cd "${EVAL_DIR}" || {
  echo "error: cannot cd to ${EVAL_DIR}" >&2
  exit 1
}

# Best effort, as in the original script: a failed uninstall must not abort
# the job.
pip uninstall jupyterlab -y \
  || echo "warning: could not uninstall jupyterlab" >&2
# NOTE(review): presumably the container image exports PIP_CONSTRAINT and it
# interferes with the editable install below -- confirm.
unset PIP_CONSTRAINT
pip install -e .

torchrun --nproc_per_node=4 -m lmms_eval \
  --model "${MODEL}" \
  --model_args "${MODEL_ARGS}" \
  --tasks "${TASKS}" \
  --batch_size "${BATCH_SIZE}" \
  --output_path "${RES_PATH}"

# Only reached when every step above succeeded.
echo "End Reached"

examples/slurm/gemma4_31b.slurm

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --account=infra01
#SBATCH --job-name=eval-gemma4-31b
#SBATCH --environment=/users/mikhaika/.edf/lmms_eval_vllm.toml
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=normal
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=288
#SBATCH --time=12:00:00
#SBATCH --output=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_gemma4_31b_%j.out
#SBATCH --error=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_gemma4_31b_%j.err
#
# Evaluate google/gemma-4-31B-it with the lmms_eval "vllm" backend on the
# medical text + VQA task list below, launched through torchrun with 4
# processes on a single node.
#
# NOTE(review): the --environment path above is tied to one user's home
# directory, while every other path is derived from the submitting user;
# adjust it before submitting from another account.

# Abort on the first failing command so that a broken install or a crashed
# evaluation makes the job exit non-zero instead of printing "End Reached"
# and exiting 0.
set -euo pipefail

# Mirror stdout and stderr into one combined, append-only .log file in
# addition to the scheduler-managed .out/.err files.
# SLURM_JOB_ID is empty when the script is run outside of Slurm.
LOG_BASE="/iopsstor/scratch/cscs/${USER}/lmms-eval/logs/eval_gemma4_31b_${SLURM_JOB_ID:-}"
exec > >(tee -a "${LOG_BASE}.log")
exec 2> >(tee -a "${LOG_BASE}.log" >&2)

TASKS="pmc_vqa,slake,vqa_rad,medqa,medmcqa,pubmedqa,mmlu_medical,path_vqa,path_mmu_test_tiny"
BATCH_SIZE=64
MODEL="vllm"
MODEL_ARGS="model=google/gemma-4-31B-it,tensor_parallel_size=1,gpu_memory_utilization=0.9,dtype=bfloat16,max_model_len=16384"
RES_PATH="/iopsstor/scratch/cscs/${USER}/PDM/results/lmms_eval/gemma4_31b_results"
EVAL_DIR="/iopsstor/scratch/cscs/${USER}/lmms-eval"

cd "${EVAL_DIR}" || {
  echo "error: cannot cd to ${EVAL_DIR}" >&2
  exit 1
}

# Best effort, as in the original script: a failed uninstall must not abort
# the job.
pip uninstall jupyterlab -y \
  || echo "warning: could not uninstall jupyterlab" >&2
# NOTE(review): presumably the container image exports PIP_CONSTRAINT and it
# interferes with the editable install below -- confirm.
unset PIP_CONSTRAINT
pip install -e .

torchrun --nproc_per_node=4 -m lmms_eval \
  --model "${MODEL}" \
  --model_args "${MODEL_ARGS}" \
  --tasks "${TASKS}" \
  --batch_size "${BATCH_SIZE}" \
  --output_path "${RES_PATH}"

# Only reached when every step above succeeded.
echo "End Reached"

scripts/generate_results_table.sh

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
#!/bin/bash
# Generate the medical benchmark results table from evaluation logs.
#
# Usage: bash scripts/generate_results_table.sh
#        LOG_DIR=/path/to/logs bash scripts/generate_results_table.sh
#
# The table is printed on stdout; unreadable log files are reported on stderr
# and shown as "—" in the table.

# Single source of truth for the log directory: it is exported here and read
# by the embedded Python program (the heredoc is quoted, so shell variables
# are not expanded inside it).
LOG_DIR="${LOG_DIR:-/iopsstor/scratch/cscs/mikhaika/lmms-eval/logs}"
export LOG_DIR

python3 - << 'PYEOF'
import os
import re
import sys

LOG_DIR = os.environ["LOG_DIR"]

# Models and their log files (base names without the ".log" suffix).
#   logs    : one or more runs; scores from later logs override earlier ones
#   samples : sample count used as the throughput denominator (None = skip)
#   ranks   : recorded for reference only; not used by this script
#   release : text shown in the "Release date" row
MODELS = {
    "Apertus-8B": {"logs": ["eval_apertus_1783312"], "samples": 7827, "ranks": 4, "release": "Aug 2025"},
    "Apertus-70B": {"logs": ["eval_apertus_70b_1783313"], "samples": 7827, "ranks": 1, "release": "Aug 2025"},
    "Apertus-1.5": {"logs": ["eval_apertus_emu3p5_1677203", "eval_apertus_emu3p5_1617212", "eval_apertus_emu3p5_1772858", "eval_apertus_emu3p5_1773638"], "samples": 25487, "ranks": 4, "release": "—"},
    "Llama-3.2": {"logs": ["eval_llama_vision_1675635", "eval_llama_vision_1675655", "eval_llama_vision_1675601", "eval_llama_vision_1683998", "eval_llama_vision_1617171", "eval_llama_vision_1773141", "eval_mmlu_1777590"], "samples": None, "ranks": 4, "release": "Sep 2024"},
    "Qwen3-VL-8B": {"logs": ["eval_qwen3_vl_1783308"], "samples": 50559, "ranks": 4, "release": "Jul 2025"},
    "Qwen3-VL-30B": {"logs": ["eval_qwen3_vl_30b_1784098"], "samples": 50559, "ranks": 4, "release": "Jul 2025"},
    "Qwen3-VL-235B": {"logs": ["eval_qwen3_vl_235b_1819158", "eval_qwen3_vl_235b_1826631"], "samples": 50559, "ranks": 1, "release": "Jul 2025"},
    "InternVL3-8B": {"logs": ["eval_internvl3_1783309"], "samples": 50559, "ranks": 4, "release": "Apr 2025"},
    "InternVL3-78B": {"logs": ["eval_internvl3_1784070"], "samples": 50559, "ranks": 1, "release": "Apr 2025"},
    "Gemma4-26B": {"logs": ["eval_gemma4_26b_1840421"], "samples": 50559, "ranks": 4, "release": "Mar 2026"},
    "Gemma4-31B": {"logs": ["eval_gemma4_31b_1849764"], "samples": 50559, "ranks": 4, "release": "Mar 2026"},
    "MedGemma-27B": {"logs": ["eval_medgemma_1783311"], "samples": 50559, "ranks": 4, "release": "Jul 2025"},
    "No image": {"logs": ["eval_internvl3_no_image_1783823"], "samples": None, "ranks": 4, "release": ""},
}

# Rows of the table: (display name, type, metric key in the log, metric label).
TASKS = [
    ("medmcqa", "Text", "accuracy", "accuracy"),
    ("medqa", "Text", "accuracy", "accuracy"),
    ("mmlu (medical)", "Text", "exact_match", "exact_match"),
    ("pubmedqa", "Text", "accuracy", "accuracy"),
    ("pmc_vqa", "VQA", "accuracy", "accuracy"),
    ("slake", "VQA", "close_accuracy", "close_accuracy"),
    ("path_vqa", "VQA", "close_accuracy", "close_accuracy"),
    ("vqa_rad", "VQA", "close_accuracy", "close_accuracy"),
    ("path_mmu", "VQA", "accuracy", "accuracy"),
]

# Display name -> task name as it appears in the log tables.
TASK_LOG_NAMES = {
    "path_mmu": "path_mmu_test_tiny",
    "mmlu (medical)": "mmlu (medical)",
}

# Width of the three leading columns (task, type, metric) of every row.
TASK_W, TYPE_W, METRIC_W = 17, 8, 16
LABEL_W = TASK_W + TYPE_W + METRIC_W

_WARNED = set()


def _read_log(log_path):
    """Return the text of log_path, or None if it cannot be read.

    Undecodable bytes are replaced rather than raising, so one stray byte
    does not discard a whole log. Each unreadable path is reported once on
    stderr.
    """
    try:
        with open(log_path, encoding="utf-8", errors="replace") as f:
            return f.read()
    except OSError as exc:
        if log_path not in _WARNED:
            _WARNED.add(log_path)
            print(f"warning: cannot read {log_path}: {exc}", file=sys.stderr)
        return None


def extract_scores(log_path):
    """Extract task scores from the "|"-delimited result rows of a log file.

    Returns a dict mapping (task_name, metric) to the float value; the dict
    is empty when the log cannot be read.
    """
    scores = {}
    content = _read_log(log_path)
    if content is None:
        return scores
    for line in content.split("\n"):
        if not line.startswith("|"):
            continue
        # Skip header/separator rows, strict-match filter rows and the
        # mmlu_flan sub-task rows.
        if "Stderr" in line or "---" in line or "strict-match" in line or " - mmlu_flan" in line:
            continue
        parts = [p.strip() for p in line.split("|")]
        if len(parts) < 9:
            continue
        task_name = parts[1]
        metric = parts[5]
        try:
            value = float(parts[7])
        except ValueError:
            continue
        # A 0.0 exact_match on a non-strict row is ignored rather than
        # recorded as a score.
        if value == 0.0 and metric == "exact_match" and "strict" not in line:
            continue
        scores[(task_name, metric)] = value
    return scores


def extract_throughput(log_paths, total_samples):
    """Return seconds per sample, or None when it cannot be computed.

    The last "Total time: <x>s" match of every readable log is summed and
    divided by total_samples.
    """
    total_time = 0
    for log_path in log_paths:
        content = _read_log(log_path)
        if content is None:
            continue
        matches = re.findall(r"Total time: ([\d.]+)s", content)
        if matches:
            try:
                total_time += float(matches[-1])
            except ValueError:
                # e.g. a lone "." matched by the character class
                continue
    if total_samples and total_time > 0:
        return total_time / total_samples
    return None


def _cells(values):
    """Right-align each value in a 13-character column."""
    return " ".join(f"{v:>13s}" for v in values)


# Collect all scores
all_scores = {}
all_throughput = {}
for model_name, info in MODELS.items():
    model_scores = {}
    log_paths = [os.path.join(LOG_DIR, f"{log_name}.log") for log_name in info["logs"]]
    for log_path in log_paths:
        model_scores.update(extract_scores(log_path))
    all_scores[model_name] = model_scores
    all_throughput[model_name] = extract_throughput(log_paths, info.get("samples"))

# Print table; the header and label rows use the same widths as the data rows
# so that the model columns line up.
model_names = list(MODELS.keys())
print(f"{'Task':{TASK_W}s}{'Type':{TYPE_W}s}{'Metric':{METRIC_W}s}" + _cells(model_names))

# Release dates
print(f"{'Release date':{LABEL_W}s}" + _cells(MODELS[m]["release"] for m in model_names))

for task_display, task_type, metric_name, metric_display in TASKS:
    task_log = TASK_LOG_NAMES.get(task_display, task_display)
    values = []
    for model_name in model_names:
        scores = all_scores[model_name]
        val = scores.get((task_log, metric_name))
        if val is None and model_name == "No image":
            # The no-image baseline logs its tasks with a "_no_image" suffix.
            val = scores.get((task_log + "_no_image", metric_name))
        if val is not None:
            values.append(f"{val*100:.2f}%")
        else:
            values.append("—")
    print(f"{task_display:{TASK_W}s}{task_type:{TYPE_W}s}{metric_display:{METRIC_W}s}" + _cells(values))

# Throughput
tp_values = []
for model_name in model_names:
    tp = all_throughput[model_name]
    if tp is not None:
        tp_values.append(f"{tp:.3f}")
    else:
        tp_values.append("—")
print(f"{'Throughput (s/sam)':{LABEL_W}s}" + _cells(tp_values))

print()
print("†mmlu underestimated — output format mismatch with FLAN filter.")
print("‡No image baseline: InternVL3-8B.")
print()
print("Logs:")
for model_name, info in MODELS.items():
    if model_name == "No image":
        continue
    logs_str = ", ".join(info["logs"])
    print(f" {model_name:20s} {logs_str}")
PYEOF

examples/slurm/internvl3_78b.slurm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ exec 2> >(tee -a ${LOG_BASE}.log >&2)
2121
TASKS="pmc_vqa,slake,vqa_rad,medqa,medmcqa,pubmedqa,mmlu_medical,path_vqa,path_mmu_test_tiny"
2222
BATCH_SIZE=16
2323
MODEL="vllm"
24-
MODEL_ARGS="model=OpenGVLab/InternVL3-78B,tensor_parallel_size=4,gpu_memory_utilization=0.95,dtype=bfloat16,max_model_len=8192"
24+
MODEL_ARGS="model=OpenGVLab/InternVL3-78B,tensor_parallel_size=4,gpu_memory_utilization=0.95,dtype=bfloat16,max_model_len=16384"
2525
RES_PATH="/iopsstor/scratch/cscs/$USER/PDM/results/lmms_eval/internvl3_78b_results"
2626
EVAL_DIR=/iopsstor/scratch/cscs/$USER/lmms-eval
2727
HF_DATASETS_OFFLINE=0

examples/slurm/meditron3_70b.slurm

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --account=infra01
#SBATCH --job-name=eval-meditron3
#SBATCH --environment=/users/mikhaika/.edf/lmms_eval_vllm.toml
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=normal
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=288
#SBATCH --time=12:00:00
#SBATCH --output=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_meditron3_70b_%j.out
#SBATCH --error=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_meditron3_70b_%j.err
#
# Evaluate OpenMeditron/Meditron3-70B with the lmms_eval "vllm" backend
# (tensor_parallel_size=4) on the text-only medical task list below, run as a
# single python3 process.
#
# NOTE(review): the --environment path above is tied to one user's home
# directory, while every other path is derived from the submitting user;
# adjust it before submitting from another account.

# Abort on the first failing command so that a broken install or a crashed
# evaluation makes the job exit non-zero instead of printing "End Reached"
# and exiting 0.
set -euo pipefail

# Mirror stdout and stderr into one combined, append-only .log file in
# addition to the scheduler-managed .out/.err files.
# SLURM_JOB_ID is empty when the script is run outside of Slurm.
LOG_BASE="/iopsstor/scratch/cscs/${USER}/lmms-eval/logs/eval_meditron3_70b_${SLURM_JOB_ID:-}"
exec > >(tee -a "${LOG_BASE}.log")
exec 2> >(tee -a "${LOG_BASE}.log" >&2)

TASKS="medqa,medmcqa,pubmedqa,mmlu_medical"
BATCH_SIZE=16
MODEL="vllm"
MODEL_ARGS="model=OpenMeditron/Meditron3-70B,tensor_parallel_size=4,gpu_memory_utilization=0.95,dtype=bfloat16,max_model_len=8192"
RES_PATH="/iopsstor/scratch/cscs/${USER}/PDM/results/lmms_eval/meditron3_70b_results"
EVAL_DIR="/iopsstor/scratch/cscs/${USER}/lmms-eval"

cd "${EVAL_DIR}" || {
  echo "error: cannot cd to ${EVAL_DIR}" >&2
  exit 1
}

# Best effort, as in the original script: a failed uninstall must not abort
# the job.
pip uninstall jupyterlab -y \
  || echo "warning: could not uninstall jupyterlab" >&2
# NOTE(review): presumably the container image exports PIP_CONSTRAINT and it
# interferes with the editable install below -- confirm.
unset PIP_CONSTRAINT
pip install -e .

python3 -m lmms_eval \
  --model "${MODEL}" \
  --model_args "${MODEL_ARGS}" \
  --tasks "${TASKS}" \
  --batch_size "${BATCH_SIZE}" \
  --output_path "${RES_PATH}"

# Only reached when every step above succeeded.
echo "End Reached"

examples/slurm/qwen3_vl_235b.slurm

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --account=infra01
#SBATCH --job-name=eval-qwen235b
#SBATCH --environment=/users/mikhaika/.edf/lmms_eval_vllm.toml
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=normal
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=288
#SBATCH --time=12:00:00
#SBATCH --output=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_qwen3_vl_235b_%j.out
#SBATCH --error=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_qwen3_vl_235b_%j.err
#
# Evaluate Qwen/Qwen3-VL-235B-A22B-Instruct-FP8 with the lmms_eval "vllm"
# backend (tensor_parallel_size=4) on the text-only medical task list below,
# run as a single python3 process.
#
# NOTE(review): the --environment path above is tied to one user's home
# directory, while every other path is derived from the submitting user;
# adjust it before submitting from another account.

# Abort on the first failing command so that a broken install or a crashed
# evaluation makes the job exit non-zero instead of printing "End Reached"
# and exiting 0.
set -euo pipefail

# Mirror stdout and stderr into one combined, append-only .log file in
# addition to the scheduler-managed .out/.err files.
# SLURM_JOB_ID is empty when the script is run outside of Slurm.
LOG_BASE="/iopsstor/scratch/cscs/${USER}/lmms-eval/logs/eval_qwen3_vl_235b_${SLURM_JOB_ID:-}"
exec > >(tee -a "${LOG_BASE}.log")
exec 2> >(tee -a "${LOG_BASE}.log" >&2)

TASKS="medqa,medmcqa,pubmedqa,mmlu_medical"
BATCH_SIZE=16
MODEL="vllm"
MODEL_ARGS="model=Qwen/Qwen3-VL-235B-A22B-Instruct-FP8,tensor_parallel_size=4,gpu_memory_utilization=0.85,dtype=auto,max_model_len=16384"
RES_PATH="/iopsstor/scratch/cscs/${USER}/PDM/results/lmms_eval/qwen3_vl_235b_results"
EVAL_DIR="/iopsstor/scratch/cscs/${USER}/lmms-eval"

cd "${EVAL_DIR}" || {
  echo "error: cannot cd to ${EVAL_DIR}" >&2
  exit 1
}

# Best effort, as in the original script: a failed uninstall must not abort
# the job.
pip uninstall jupyterlab -y \
  || echo "warning: could not uninstall jupyterlab" >&2
# NOTE(review): presumably the container image exports PIP_CONSTRAINT and it
# interferes with the editable install below -- confirm.
unset PIP_CONSTRAINT
pip install -e .

python3 -m lmms_eval \
  --model "${MODEL}" \
  --model_args "${MODEL_ARGS}" \
  --tasks "${TASKS}" \
  --batch_size "${BATCH_SIZE}" \
  --output_path "${RES_PATH}"

# Only reached when every step above succeeded.
echo "End Reached"

examples/slurm/qwen3_vl_30b.slurm

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --account=infra01
#SBATCH --job-name=eval-qwen3vl30b
#SBATCH --environment=/users/mikhaika/.edf/lmms_eval_vllm.toml
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=normal
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=288
#SBATCH --time=12:00:00
#SBATCH --output=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_qwen3_vl_30b_%j.out
#SBATCH --error=/iopsstor/scratch/cscs/%u/lmms-eval/logs/eval_qwen3_vl_30b_%j.err
#
# Evaluate Qwen/Qwen3-VL-30B-A3B-Instruct with the lmms_eval "vllm" backend on
# the medical text + VQA task list below, launched through torchrun with 4
# processes on a single node.
#
# NOTE(review): the --environment path above is tied to one user's home
# directory, while every other path is derived from the submitting user;
# adjust it before submitting from another account.

# Abort on the first failing command so that a broken install or a crashed
# evaluation makes the job exit non-zero instead of printing "End Reached"
# and exiting 0.
set -euo pipefail

# Mirror stdout and stderr into one combined, append-only .log file in
# addition to the scheduler-managed .out/.err files.
# SLURM_JOB_ID is empty when the script is run outside of Slurm.
LOG_BASE="/iopsstor/scratch/cscs/${USER}/lmms-eval/logs/eval_qwen3_vl_30b_${SLURM_JOB_ID:-}"
exec > >(tee -a "${LOG_BASE}.log")
exec 2> >(tee -a "${LOG_BASE}.log" >&2)

TASKS="pmc_vqa,slake,vqa_rad,medqa,medmcqa,pubmedqa,mmlu_medical,path_vqa,path_mmu_test_tiny"
BATCH_SIZE=64
MODEL="vllm"
MODEL_ARGS="model=Qwen/Qwen3-VL-30B-A3B-Instruct,tensor_parallel_size=1,gpu_memory_utilization=0.9,dtype=bfloat16,max_model_len=16384"
RES_PATH="/iopsstor/scratch/cscs/${USER}/PDM/results/lmms_eval/qwen3_vl_30b_results"
EVAL_DIR="/iopsstor/scratch/cscs/${USER}/lmms-eval"

cd "${EVAL_DIR}" || {
  echo "error: cannot cd to ${EVAL_DIR}" >&2
  exit 1
}

# Best effort, as in the original script: a failed uninstall must not abort
# the job.
pip uninstall jupyterlab -y \
  || echo "warning: could not uninstall jupyterlab" >&2
# NOTE(review): presumably the container image exports PIP_CONSTRAINT and it
# interferes with the editable install below -- confirm.
unset PIP_CONSTRAINT
pip install -e .

torchrun --nproc_per_node=4 -m lmms_eval \
  --model "${MODEL}" \
  --model_args "${MODEL_ARGS}" \
  --tasks "${TASKS}" \
  --batch_size "${BATCH_SIZE}" \
  --output_path "${RES_PATH}"

# Only reached when every step above succeeded.
echo "End Reached"

0 commit comments

Comments
 (0)