Skip to content

Commit b680326

Browse files
fix: add exception handling for corrupted terms in obo files.
1 parent 98f904b commit b680326

2 files changed

Lines changed: 386 additions & 5 deletions

File tree

benchmark/benchmark_memory.py

Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,345 @@
1+
#!/usr/bin/env python3
2+
"""Benchmark memory required to load CHEBI with memray, per backend.
3+
4+
This script has two modes:
5+
1) Driver mode (default): orchestrates memray runs, generates HTML reports,
6+
parses peak memory, and prints/saves a summary.
7+
2) Worker mode (--worker): performs only `ClientOntology.load(...)`.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import argparse
13+
import csv
14+
import json
15+
import re
16+
import shutil
17+
import statistics
18+
import subprocess
19+
import sys
20+
from datetime import datetime
21+
from pathlib import Path
22+
23+
from ontograph.client import ClientOntology
24+
25+
# Backends benchmarked by the driver when `--backend both` is selected.
BACKENDS: tuple[str, ...] = ('pronto', 'graphblas')

# Default values for the command-line options declared in `main()`.
DEFAULT_ONTOLOGY_ID: str = 'chebi'
DEFAULT_CACHE_DIR: str = './data/out'
DEFAULT_REPETITIONS: int = 5
DEFAULT_RESULTS_DIR: str = './benchmark/results'
30+
31+
32+
def _to_mb(value: float, unit: str) -> float:
33+
unit = unit.upper()
34+
factors = {
35+
'B': 1 / (1024.0 * 1024.0),
36+
'KB': 1 / 1024.0,
37+
'MB': 1.0,
38+
'GB': 1024.0,
39+
'TB': 1024.0 * 1024.0,
40+
}
41+
if unit not in factors:
42+
raise ValueError(f'Unsupported unit: {unit}')
43+
return value * factors[unit]
44+
45+
46+
def _parse_peak_memory_mb(memray_stats_output: str) -> float | None:
47+
match = re.search(
48+
r'Peak memory usage:\s*([0-9]+(?:\.[0-9]+)?)\s*([KMGTP]?B)',
49+
memray_stats_output,
50+
)
51+
if not match:
52+
return None
53+
value = float(match.group(1))
54+
unit = match.group(2)
55+
return _to_mb(value, unit)
56+
57+
58+
def _run_command(cmd: list[str]) -> subprocess.CompletedProcess:
59+
return subprocess.run(
60+
cmd,
61+
check=True,
62+
capture_output=True,
63+
text=True,
64+
)
65+
66+
67+
def _prime_cache(cache_dir: str, ontology_id: str) -> None:
    """Load `ontology_id` once with the pronto backend, discarding the result.

    The driver calls this before measuring anything. The call is made only
    for its side effect on `cache_dir` (presumably fetching and caching the
    ontology file; that logic lives in `ClientOntology` and is not visible
    here).
    """
    ClientOntology(cache_dir=cache_dir).load(
        source=ontology_id, backend='pronto'
    )
70+
71+
72+
def _resolve_local_source(cache_dir: str, ontology_id: str) -> Path:
73+
cache_path = Path(cache_dir)
74+
preferred = cache_path / f'{ontology_id.lower()}.obo'
75+
if preferred.exists():
76+
return preferred
77+
78+
candidates = sorted(
79+
list(cache_path.glob(f'{ontology_id.lower()}*.obo'))
80+
+ list(cache_path.glob(f'*{ontology_id.lower()}*.obo'))
81+
)
82+
if candidates:
83+
return candidates[0]
84+
85+
raise FileNotFoundError(
86+
f'Could not find local OBO file for "{ontology_id}" in {cache_path}. '
87+
'Prime cache first or pass --source-file.'
88+
)
89+
90+
91+
def _worker_load(
    backend: str, source: str, cache_dir: str, include_obsolete: bool
) -> None:
    """Worker-mode body: perform a single ontology load and nothing else.

    `main()` calls this when the hidden `--worker` flag is set, so that the
    memray trace of the worker process covers only this load.

    Args:
        backend: Backend name forwarded to `ClientOntology.load`.
        source: Ontology source forwarded to `ClientOntology.load`.
        cache_dir: Cache directory used to construct the client.
        include_obsolete: Forwarded to `ClientOntology.load`.
    """
    load_options = {
        'source': source,
        'backend': backend,
        'include_obsolete': include_obsolete,
    }
    ClientOntology(cache_dir=cache_dir).load(**load_options)
98+
99+
100+
def _ensure_memray_installed() -> None:
101+
if shutil.which('memray') is None:
102+
raise RuntimeError(
103+
'memray executable not found. Install it (e.g., `uv add --dev memray`).'
104+
)
105+
106+
107+
def _backend_rows(rows: list[dict], backend: str) -> list[dict]:
108+
return [row for row in rows if row['backend'] == backend]
109+
110+
111+
def _write_artifacts(rows: list[dict], outdir: Path) -> None:
112+
outdir.mkdir(parents=True, exist_ok=True)
113+
114+
json_path = outdir / 'summary.json'
115+
with open(json_path, 'w', encoding='utf-8') as f:
116+
json.dump(rows, f, indent=2)
117+
118+
csv_path = outdir / 'summary.csv'
119+
fieldnames = [
120+
'backend',
121+
'run',
122+
'status',
123+
'peak_memory_mb',
124+
'trace_file',
125+
'flamegraph_html',
126+
'leaks_flamegraph_html',
127+
'error',
128+
]
129+
with open(csv_path, 'w', encoding='utf-8', newline='') as f:
130+
writer = csv.DictWriter(f, fieldnames=fieldnames)
131+
writer.writeheader()
132+
for row in rows:
133+
writer.writerow(
134+
{
135+
'backend': row['backend'],
136+
'run': row['run'],
137+
'status': row['status'],
138+
'peak_memory_mb': row.get('peak_memory_mb'),
139+
'trace_file': row.get('trace_file'),
140+
'flamegraph_html': row.get('flamegraph_html'),
141+
'leaks_flamegraph_html': row.get('leaks_flamegraph_html'),
142+
'error': row.get('error'),
143+
}
144+
)
145+
146+
147+
def _print_summary(
    rows: list[dict],
    ontology_id: str,
    source_file: Path,
    cache_dir: str,
    repetitions: int,
    outdir: Path,
) -> None:
    """Print the run settings followed by one summary line per backend.

    For every backend in `BACKENDS` the line shows the number of successful
    runs that have a peak-memory value, the median of those values and the
    individual samples. Backends without such runs are printed with ``NA``.
    """
    settings = (
        ('ontology_id', ontology_id),
        ('source_file', source_file),
        ('cache_dir', cache_dir),
        ('repetitions', repetitions),
        ('artifacts_dir', outdir),
    )
    for label, value in settings:
        print(f'{label}: {value}')
    print()
    print('backend | successful_runs | peak_memory_mb_median | peak_memory_mb_samples')

    for backend in BACKENDS:
        successful = (
            row for row in _backend_rows(rows, backend) if row['status'] == 'ok'
        )
        samples = [
            row['peak_memory_mb']
            for row in successful
            if row.get('peak_memory_mb') is not None
        ]
        if not samples:
            print(f'{backend} | 0 | NA | NA')
            continue

        median_mb = statistics.median(samples)
        formatted = ', '.join(f'{sample:.3f}' for sample in samples)
        sample_str = f'[{formatted}]'
        print(f'{backend} | {len(samples)} | {median_mb:.3f} | {sample_str}')
178+
179+
180+
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by both driver and worker mode."""
    parser = argparse.ArgumentParser(
        description='Memray benchmark for CHEBI load memory per backend.'
    )
    parser.add_argument(
        '--backend',
        choices=(*BACKENDS, 'both'),
        default='both',
        help='Backend to benchmark (default: both).',
    )
    parser.add_argument(
        '--ontology-id',
        default=DEFAULT_ONTOLOGY_ID,
        help='Ontology ID to prime and locate locally (default: chebi).',
    )
    parser.add_argument(
        '--source-file',
        default=None,
        help='Local ontology file path to use for all runs.',
    )
    parser.add_argument(
        '--cache-dir',
        default=DEFAULT_CACHE_DIR,
        help='Cache directory (default: ./data/out).',
    )
    parser.add_argument(
        '--repetitions',
        type=int,
        default=DEFAULT_REPETITIONS,
        help='Repetitions per backend (default: 5).',
    )
    parser.add_argument(
        '--include-obsolete',
        action='store_true',
        help='Pass include_obsolete=True to client.load(...).',
    )
    parser.add_argument(
        '--results-dir',
        default=DEFAULT_RESULTS_DIR,
        help='Base directory for outputs (default: ./benchmark/results).',
    )
    # Internal flag: set by the driver when it re-invokes this script under
    # memray, hidden from --help.
    parser.add_argument(
        '--worker',
        action='store_true',
        help=argparse.SUPPRESS,
    )
    return parser


def _select_source_file(args: argparse.Namespace) -> Path:
    """Return the local ontology file that every benchmark run will load.

    Uses `--source-file` when given; otherwise primes the cache and resolves
    a local file so the measured runs avoid network/download noise.

    Raises:
        FileNotFoundError: If `--source-file` does not exist, or no local
            file can be resolved after priming the cache.
    """
    if args.source_file:
        source_file = Path(args.source_file)
        if not source_file.exists():
            raise FileNotFoundError(f'--source-file not found: {source_file}')
        return source_file

    _prime_cache(cache_dir=args.cache_dir, ontology_id=args.ontology_id)
    return _resolve_local_source(
        cache_dir=args.cache_dir, ontology_id=args.ontology_id
    )


def _benchmark_single_run(
    backend: str,
    run: int,
    outdir: Path,
    script_path: Path,
    source_file: Path,
    cache_dir: str,
    include_obsolete: bool,
) -> dict:
    """Profile one worker load under memray and return its result row.

    Runs, in order: `memray run` on this script in worker mode,
    `memray stats` (parsed for peak memory), `memray flamegraph` and
    `memray flamegraph --leaks`. Any failure is recorded in the row
    ('status' = 'error', 'error' = details) instead of being raised, so one
    bad run does not abort the whole benchmark.
    """
    trace_file = outdir / f'{backend}_run{run}.bin'
    flamegraph_html = outdir / f'{backend}_run{run}_flamegraph.html'
    leaks_flamegraph_html = outdir / f'{backend}_run{run}_flamegraph_leaks.html'

    row = {
        'backend': backend,
        'run': run,
        'status': 'ok',
        'peak_memory_mb': None,
        'trace_file': str(trace_file),
        'flamegraph_html': str(flamegraph_html),
        'leaks_flamegraph_html': str(leaks_flamegraph_html),
        'error': None,
    }

    try:
        run_cmd = [
            'memray',
            'run',
            '-o',
            str(trace_file),
            str(script_path),
            '--worker',
            '--backend',
            backend,
            '--source-file',
            str(source_file),
            '--cache-dir',
            cache_dir,
        ]
        if include_obsolete:
            run_cmd.append('--include-obsolete')
        _run_command(run_cmd)

        stats_out = _run_command(['memray', 'stats', str(trace_file)]).stdout
        row['peak_memory_mb'] = _parse_peak_memory_mb(stats_out)

        _run_command(
            ['memray', 'flamegraph', '-o', str(flamegraph_html), str(trace_file)]
        )
        _run_command(
            [
                'memray',
                'flamegraph',
                '--leaks',
                '-o',
                str(leaks_flamegraph_html),
                str(trace_file),
            ]
        )
    except subprocess.CalledProcessError as exc:
        row['status'] = 'error'
        row['error'] = (
            f'Command failed: {" ".join(exc.cmd)}\n'
            f'stdout:\n{exc.stdout}\n'
            f'stderr:\n{exc.stderr}'
        )
    except Exception as exc:  # noqa: BLE001
        # Deliberately broad: the failure is recorded in the row and the
        # remaining runs continue.
        row['status'] = 'error'
        row['error'] = str(exc)

    return row


def main() -> None:
    """Entry point: act as memray worker or as benchmark driver.

    Worker mode (`--worker`) only performs the ontology load and returns.
    Driver mode checks that memray is available, picks the source file, runs
    every selected backend `--repetitions` times in a fresh timestamped
    results directory, then writes the artifacts and prints the summary.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    if args.worker:
        _worker_load(
            backend=args.backend,
            source=args.source_file,
            cache_dir=args.cache_dir,
            include_obsolete=args.include_obsolete,
        )
        return

    # Zero or negative repetitions would silently produce an empty benchmark.
    if args.repetitions < 1:
        parser.error('--repetitions must be at least 1')

    _ensure_memray_installed()
    source_file = _select_source_file(args)

    selected_backends = BACKENDS if args.backend == 'both' else (args.backend,)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    outdir = Path(args.results_dir) / f'memory_memray_{timestamp}'
    outdir.mkdir(parents=True, exist_ok=True)

    script_path = Path(__file__).resolve()
    rows: list[dict] = [
        _benchmark_single_run(
            backend=backend,
            run=run,
            outdir=outdir,
            script_path=script_path,
            source_file=source_file,
            cache_dir=args.cache_dir,
            include_obsolete=args.include_obsolete,
        )
        for backend in selected_backends
        for run in range(1, args.repetitions + 1)
    ]

    _write_artifacts(rows=rows, outdir=outdir)
    _print_summary(
        rows=rows,
        ontology_id=args.ontology_id,
        source_file=source_file,
        cache_dir=args.cache_dir,
        repetitions=args.repetitions,
        outdir=outdir,
    )
342+
343+
344+
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)