#!/bin/bash
#
# Generate the medical benchmark results table from evaluation logs.
#
# Usage: bash scripts/generate_results_table.sh
#
# The table is produced by the Python program embedded in the here-document
# below; this wrapper only checks the log directory and hands the program to
# python3 on standard input.

set -euo pipefail

# Directory holding the evaluation logs. The embedded Python program carries
# its own LOG_DIR constant (the quoted here-document is not expanded by the
# shell), so keep the two values in sync.
readonly LOG_DIR="/iopsstor/scratch/cscs/mikhaika/lmms-eval/logs"

# Warn rather than abort: without readable logs the program still prints the
# table, with every score and throughput cell shown as "—".
if [[ ! -d "${LOG_DIR}" ]]; then
  printf 'warning: log directory not found: %s\n' "${LOG_DIR}" >&2
fi

# The models (name, log files, total samples, ranks, release date) are listed
# in the MODELS table of the program; a model evaluated over several runs lists
# every one of its logs there.
#
# The delimiter is quoted so the shell expands nothing inside the program, and
# it must match the closing PYEOF line character for character (no trailing
# blanks), otherwise the here-document never terminates.
python3 - << 'PYEOF'
10+ import re, os
11+
# Directory containing the evaluation logs; every name listed under "logs"
# below refers to the file "<name>.log" inside this directory.
LOG_DIR = "/iopsstor/scratch/cscs/mikhaika/lmms-eval/logs"

# Models shown in the table, in column order.
#
# Fields of each entry:
#   logs    - log names (without ".log"); scores from later logs override
#             scores from earlier ones for the same (task, metric).
#   samples - divisor used for the throughput row; None means no throughput
#             figure is reported for the model.
#   ranks   - recorded for reference; not read by the code in this script.
#   release - text printed in the "Release date" row.
MODELS = {
    "Apertus-8B": dict(
        logs=["eval_apertus_1783312"],
        samples=7827,
        ranks=4,
        release="Aug 2025",
    ),
    "Apertus-70B": dict(
        logs=["eval_apertus_70b_1783313"],
        samples=7827,
        ranks=1,
        release="Aug 2025",
    ),
    "Apertus-1.5": dict(
        logs=[
            "eval_apertus_emu3p5_1677203",
            "eval_apertus_emu3p5_1617212",
            "eval_apertus_emu3p5_1772858",
            "eval_apertus_emu3p5_1773638",
        ],
        samples=25487,
        ranks=4,
        release="—",
    ),
    "Llama-3.2": dict(
        logs=[
            "eval_llama_vision_1675635",
            "eval_llama_vision_1675655",
            "eval_llama_vision_1675601",
            "eval_llama_vision_1683998",
            "eval_llama_vision_1617171",
            "eval_llama_vision_1773141",
            "eval_mmlu_1777590",
        ],
        samples=None,
        ranks=4,
        release="Sep 2024",
    ),
    "Qwen3-VL-8B": dict(
        logs=["eval_qwen3_vl_1783308"],
        samples=50559,
        ranks=4,
        release="Jul 2025",
    ),
    "Qwen3-VL-30B": dict(
        logs=["eval_qwen3_vl_30b_1784098"],
        samples=50559,
        ranks=4,
        release="Jul 2025",
    ),
    "Qwen3-VL-235B": dict(
        logs=["eval_qwen3_vl_235b_1819158", "eval_qwen3_vl_235b_1826631"],
        samples=50559,
        ranks=1,
        release="Jul 2025",
    ),
    "InternVL3-8B": dict(
        logs=["eval_internvl3_1783309"],
        samples=50559,
        ranks=4,
        release="Apr 2025",
    ),
    "InternVL3-78B": dict(
        logs=["eval_internvl3_1784070"],
        samples=50559,
        ranks=1,
        release="Apr 2025",
    ),
    "Gemma4-26B": dict(
        logs=["eval_gemma4_26b_1840421"],
        samples=50559,
        ranks=4,
        release="Mar 2026",
    ),
    "Gemma4-31B": dict(
        logs=["eval_gemma4_31b_1849764"],
        samples=50559,
        ranks=4,
        release="Mar 2026",
    ),
    "MedGemma-27B": dict(
        logs=["eval_medgemma_1783311"],
        samples=50559,
        ranks=4,
        release="Jul 2025",
    ),
    "No image": dict(
        logs=["eval_internvl3_no_image_1783823"],
        samples=None,
        ranks=4,
        release="",
    ),
}
30+
# Benchmarks reported in the table, as (task, metric) pairs grouped by the
# benchmark type shown in the "Type" column. Row order follows these lists.
_TEXT_TASKS = [
    ("medmcqa", "accuracy"),
    ("medqa", "accuracy"),
    ("mmlu (medical)", "exact_match"),
    ("pubmedqa", "accuracy"),
]
_VQA_TASKS = [
    ("pmc_vqa", "accuracy"),
    ("slake", "close_accuracy"),
    ("path_vqa", "close_accuracy"),
    ("vqa_rad", "close_accuracy"),
    ("path_mmu", "accuracy"),
]

# One tuple per table row:
#   (display name, type, metric name looked up in the logs, metric label).
# The metric label printed in the table is the same string as the metric name.
TASKS = [
    (task, task_type, metric, metric)
    for task_type, group in (("Text", _TEXT_TASKS), ("VQA", _VQA_TASKS))
    for task, metric in group
]

# Display name -> task name as it appears in the log tables. Tasks without an
# entry are looked up under their display name.
TASK_LOG_NAMES = {
    "path_mmu": "path_mmu_test_tiny",
    "mmlu (medical)": "mmlu (medical)",
}
49+
def extract_scores(log_path):
    """Collect ``(task, metric) -> value`` pairs from the result tables of a log.

    Only lines that start with ``|`` are considered. Each such line is split
    on ``|``; field 1 is taken as the task name, field 5 as the metric name
    and field 7 as the value.

    Rows are skipped when they
      * contain "Stderr", "---", "strict-match" or " - mmlu_flan",
      * have fewer than nine ``|``-separated fields,
      * carry a value that is not a number, or
      * report an ``exact_match`` of exactly 0.0.

    Reading is best-effort: undecodable bytes are replaced instead of aborting
    the scan, and a log that cannot be opened yields an empty result plus a
    warning on stderr.

    Args:
        log_path: Path of the log file to scan.

    Returns:
        dict mapping ``(task_name, metric)`` to the value as a float. When a
        pair occurs more than once, the last occurrence wins.
    """
    import sys  # local import: only needed for the warning below

    task_col, metric_col, value_col = 1, 5, 7
    min_fields = 9
    skip_markers = ("Stderr", "---", "strict-match", " - mmlu_flan")

    scores = {}
    try:
        # errors="replace" keeps one stray byte in a log from discarding every
        # table that follows it; the fields parsed here are plain ASCII.
        with open(log_path, encoding="utf-8", errors="replace") as log_file:
            for line in log_file:
                if not line.startswith("|"):
                    continue
                if any(marker in line for marker in skip_markers):
                    continue
                parts = [field.strip() for field in line.split("|")]
                if len(parts) < min_fields:
                    continue
                metric = parts[metric_col]
                try:
                    value = float(parts[value_col])
                except ValueError:
                    # Not a numeric cell (e.g. a header or "N/A").
                    continue
                # NOTE(review): presumably a zero exact_match is treated as
                # "no usable result" so it cannot overwrite a real score from
                # another run -- confirm this is the intent.
                if value == 0.0 and metric == "exact_match" and "strict" not in line:
                    continue
                scores[(parts[task_col], metric)] = value
    except OSError as exc:
        print(f"warning: cannot read {log_path}: {exc}", file=sys.stderr)
    return scores
75+
def extract_throughput(log_paths, total_samples):
    """Return the average time per sample, in seconds, over a set of logs.

    Each log contributes the number from its last ``Total time: <seconds>s``
    occurrence; the contributions are summed and divided by ``total_samples``.

    Reading is best-effort: undecodable bytes are replaced, and a log that
    cannot be opened or whose last figure is malformed contributes nothing
    (a warning is written to stderr).

    Args:
        log_paths: Iterable of log file paths belonging to one model.
        total_samples: Number of samples the summed time is divided by; a
            falsy value (``None`` or 0) disables the computation.

    Returns:
        Seconds per sample as a float, or ``None`` when ``total_samples`` is
        falsy or no positive total time was found.
    """
    import sys  # local import: only needed for the warnings below

    time_pattern = re.compile(r"Total time: ([\d.]+)s")

    total_time = 0.0
    for log_path in log_paths:
        last_match = None
        try:
            # Scan line by line instead of loading the whole log; the pattern
            # cannot span lines, so the last match per file is unchanged.
            with open(log_path, encoding="utf-8", errors="replace") as log_file:
                for line in log_file:
                    found = time_pattern.findall(line)
                    if found:
                        last_match = found[-1]
        except OSError as exc:
            print(f"warning: cannot read {log_path}: {exc}", file=sys.stderr)
            continue
        if last_match is None:
            continue
        try:
            total_time += float(last_match)
        except ValueError:
            # "[\d.]+" also matches strings such as "1.2.3".
            print(f"warning: malformed total time {last_match!r} in {log_path}", file=sys.stderr)

    if total_samples and total_time > 0:
        return total_time / total_samples
    return None
91+
# Gather the results of every model.
#   all_scores[model]     -> {(task, metric): value}, merged over the model's
#                            logs in listed order (later logs override earlier).
#   all_throughput[model] -> seconds per sample, or None when unavailable.
all_scores = {}
all_throughput = {}
for model_name, info in MODELS.items():
    log_paths = [os.path.join(LOG_DIR, log_name + ".log") for log_name in info["logs"]]

    merged = {}
    for path in log_paths:
        merged.update(extract_scores(path))

    all_scores[model_name] = merged
    all_throughput[model_name] = extract_throughput(log_paths, info.get("samples"))
102+
# Render the report on stdout: header, release-date row, one row per entry of
# TASKS, the throughput row, the footnotes and the list of logs per model.
#
# Every table row starts with a label block exactly as wide as the three label
# columns of a task row (task, type, metric), followed by one right-aligned
# cell per model. Sharing the widths keeps the header, "Release date" and
# "Throughput" cells in the same columns as the task scores.
_TASK_W, _TYPE_W, _METRIC_W = 17, 8, 16
_LABEL_W = _TASK_W + _TYPE_W + _METRIC_W
_CELL_W = 13
_MISSING = "—"  # shown when a score or throughput figure is unavailable


def _cells(values):
    """Join the per-model cells of one row, each right-aligned to _CELL_W."""
    return " ".join(f"{value:>{_CELL_W}s}" for value in values)


model_names = list(MODELS.keys())

# Header and release dates.
print(f"{'Task':{_TASK_W}s}{'Type':{_TYPE_W}s}{'Metric':{_METRIC_W}s}" + _cells(model_names))
print(f"{'Release date':{_LABEL_W}s}" + _cells(MODELS[name]["release"] for name in model_names))

# One row per task; scores are printed as percentages with two decimals.
for task_display, task_type, metric_name, metric_display in TASKS:
    task_log = TASK_LOG_NAMES.get(task_display, task_display)
    row = []
    for model_name in model_names:
        scores = all_scores[model_name]
        val = scores.get((task_log, metric_name))
        if val is None and model_name == "No image":
            # The no-image baseline is looked up under "<task>_no_image" when
            # the plain task name is absent.
            val = scores.get((task_log + "_no_image", metric_name))
        row.append(_MISSING if val is None else f"{val*100:.2f}%")
    print(f"{task_display:{_TASK_W}s}{task_type:{_TYPE_W}s}{metric_display:{_METRIC_W}s}" + _cells(row))

# Throughput, in seconds per sample.
tp_values = []
for model_name in model_names:
    tp = all_throughput[model_name]
    tp_values.append(_MISSING if tp is None else f"{tp:.3f}")
print(f"{'Throughput (s/sam)':{_LABEL_W}s}" + _cells(tp_values))

# Footnotes.
print()
print("†mmlu underestimated — output format mismatch with FLAN filter.")
print("‡No image baseline: InternVL3-8B.")
print()

# Logs behind each column; the no-image baseline is left out of this list.
print("Logs:")
for model_name, info in MODELS.items():
    if model_name == "No image":
        continue
    logs_str = ", ".join(info["logs"])
    print(f" {model_name:20s} {logs_str}")
153+ PYEOF
0 commit comments