Skip to content

Commit 157a417

Browse files
kiritigowda and claude committed
Replace regression/change% with speedup and add verification status to comparison
- Replace confusing regression/improvement/change% with speedup ratio (impl B throughput / impl A throughput, >1.00 means B is faster)
- Add Verified column (PASS/FAIL) for each implementation
- Include unsupported/unverified benchmarks in results instead of silently skipping them (shown as N/A or FAIL)
- Summary now shows verification counts instead of regression counts
- Update both C++ and Python comparison implementations

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
1 parent e6f2e36 commit 157a417

2 files changed

Lines changed: 228 additions & 187 deletions

File tree

scripts/compare_reports.py

Lines changed: 109 additions & 100 deletions
Original file line number | Diff line number | Diff line change
@@ -189,99 +189,105 @@ def write_markdown(impl_names, result_maps, all_keys, output_path, reports, syst
189189
f.write(f'| {display} | {a_val:.2f} | {b_val:.2f} | {sign}{change:.1f} |\n')
190190
f.write('\n')
191191

192-
# --- Summary ---
193-
regressions = 0
194-
improvements = 0
195-
same_count = 0
196-
cat_regressions = {}
197-
cat_improvements = {}
198-
192+
# --- Build comparison rows (include all results, not just verified) ---
199193
comparison_rows = []
200194
for key in all_keys:
201195
name, mode, resolution = key
202196
r_a = result_maps[0].get(key)
203197
r_b = result_maps[1].get(key)
204198

205-
if not (r_a and r_b):
206-
continue
207-
if not (r_a.get('supported', False) and r_b.get('supported', False)):
208-
continue
209-
if not (r_a.get('verified', True) and r_b.get('verified', True)):
199+
if not r_a and not r_b:
210200
continue
211201

212-
wc_a = r_a.get('wall_clock', {})
213-
wc_b = r_b.get('wall_clock', {})
214-
median_a = wc_a.get('median_ms', 0)
215-
median_b = wc_b.get('median_ms', 0)
216-
if median_a <= 0 or median_b <= 0:
217-
continue
218-
219-
mps_a = r_a.get('megapixels_per_sec', 0)
220-
mps_b = r_b.get('megapixels_per_sec', 0)
221-
cv_a = wc_a.get('cv_percent', 0)
222-
cv_b = wc_b.get('cv_percent', 0)
223-
category = r_a.get('category', '')
224-
change_pct = ((median_b - median_a) / median_a) * 100
225-
226-
if change_pct > 5.0:
227-
status = 'REGRESSION'
228-
regressions += 1
229-
cat_regressions[category] = cat_regressions.get(category, 0) + 1
230-
elif change_pct < -5.0:
231-
status = 'IMPROVEMENT'
232-
improvements += 1
233-
cat_improvements[category] = cat_improvements.get(category, 0) + 1
202+
row = {'name': name, 'mode': mode, 'resolution': resolution}
203+
204+
for side, r in [('a', r_a), ('b', r_b)]:
205+
if r:
206+
row[f'supported_{side}'] = r.get('supported', False)
207+
row[f'verified_{side}'] = r.get('verified', True) if r.get('supported', False) else False
208+
wc = r.get('wall_clock', {})
209+
row[f'median_{side}'] = wc.get('median_ms', 0)
210+
row[f'mps_{side}'] = r.get('megapixels_per_sec', 0)
211+
row[f'cv_{side}'] = wc.get('cv_percent', 0)
212+
row[f'category'] = r.get('category', '')
213+
else:
214+
row[f'supported_{side}'] = False
215+
row[f'verified_{side}'] = False
216+
row[f'median_{side}'] = 0
217+
row[f'mps_{side}'] = 0
218+
row[f'cv_{side}'] = 0
219+
220+
if (row['median_a'] > 0 and row['median_b'] > 0
221+
and row['verified_a'] and row['verified_b']):
222+
row['speedup'] = row['mps_b'] / row['mps_a'] if row['mps_a'] > 0 else 0
234223
else:
235-
status = 'same'
236-
same_count += 1
224+
row['speedup'] = 0
225+
226+
comparison_rows.append(row)
237227

238-
comparison_rows.append({
239-
'name': name, 'category': category, 'mode': mode, 'resolution': resolution,
240-
'median_a': median_a, 'median_b': median_b,
241-
'mps_a': mps_a, 'mps_b': mps_b,
242-
'cv_a': cv_a, 'cv_b': cv_b,
243-
'change_pct': change_pct, 'status': status
244-
})
228+
comparison_rows.sort(key=lambda r: r.get('speedup', 0))
245229

246-
comparison_rows.sort(key=lambda r: r['change_pct'], reverse=True)
230+
# --- Summary ---
231+
both_verified = sum(1 for r in comparison_rows if r['verified_a'] and r['verified_b'])
232+
a_only = sum(1 for r in comparison_rows if r['verified_a'] and not r['verified_b'])
233+
b_only = sum(1 for r in comparison_rows if not r['verified_a'] and r['verified_b'])
234+
235+
keys_a = set(result_maps[0].keys())
236+
keys_b = set(result_maps[1].keys())
237+
only_a_keys = sorted(keys_a - keys_b)
238+
only_b_keys = sorted(keys_b - keys_a)
247239

248240
f.write('## Summary\n\n')
249241
f.write('| Metric | Count |\n')
250242
f.write('|:---|---:|\n')
251-
f.write(f'| Total compared | {len(comparison_rows)} |\n')
252-
f.write(f'| Regressions (>5% slower) | {regressions} |\n')
253-
f.write(f'| Improvements (>5% faster) | {improvements} |\n')
254-
f.write(f'| Unchanged | {same_count} |\n\n')
255-
256-
if cat_regressions or cat_improvements:
257-
f.write('### By Category\n\n')
258-
f.write('| Category | Regressions | Improvements |\n')
259-
f.write('|:---|---:|---:|\n')
260-
all_summary_cats = sorted(set(list(cat_regressions.keys()) + list(cat_improvements.keys())))
261-
for cat in all_summary_cats:
262-
reg = cat_regressions.get(cat, 0)
263-
imp = cat_improvements.get(cat, 0)
264-
f.write(f'| {cat} | {reg} | {imp} |\n')
265-
f.write('\n')
243+
f.write(f'| Total benchmarks compared | {len(comparison_rows)} |\n')
244+
f.write(f'| Both verified | {both_verified} |\n')
245+
if a_only > 0:
246+
f.write(f'| Verified only in {impl_names[0]} | {a_only} |\n')
247+
if b_only > 0:
248+
f.write(f'| Verified only in {impl_names[1]} | {b_only} |\n')
249+
if only_a_keys:
250+
f.write(f'| Only in {impl_names[0]} | {len(only_a_keys)} |\n')
251+
if only_b_keys:
252+
f.write(f'| Only in {impl_names[1]} | {len(only_b_keys)} |\n')
253+
f.write('\n')
266254

267255
# --- Detailed Results ---
268256
f.write('## Detailed Comparison\n\n')
269-
f.write(f'> Change % is based on median latency. Positive = slower (regression), negative = faster (improvement).\n\n')
270-
f.write(f'| Benchmark | Mode | Resolution | {impl_names[0]} (ms) | {impl_names[0]} (MP/s) | '
271-
f'{impl_names[1]} (ms) | {impl_names[1]} (MP/s) | Change % | Status |\n')
272-
f.write('|:---|:---|:---|---:|---:|---:|---:|---:|:---|\n')
257+
f.write(f'> Speedup = {impl_names[1]} throughput / {impl_names[0]} throughput. '
258+
f'Values >1.00 mean {impl_names[1]} is faster.\n\n')
259+
f.write(f'| Benchmark | Mode | Resolution '
260+
f'| {impl_names[0]} (ms) | {impl_names[0]} (MP/s) | {impl_names[0]} Verified '
261+
f'| {impl_names[1]} (ms) | {impl_names[1]} (MP/s) | {impl_names[1]} Verified '
262+
f'| Speedup |\n')
263+
f.write('|:---|:---|:---|---:|---:|:---:|---:|---:|:---:|---:|\n')
273264

274265
has_unstable = False
275266
for row in comparison_rows:
276267
flag = ''
277268
if row['cv_a'] > 15 or row['cv_b'] > 15:
278269
flag = ' *'
279270
has_unstable = True
280-
sign = '+' if row['change_pct'] >= 0 else ''
281-
f.write(f'| {row["name"]} | {row["mode"]} | {row["resolution"]} '
282-
f'| {row["median_a"]:.3f} | {row["mps_a"]:.1f} '
283-
f'| {row["median_b"]:.3f} | {row["mps_b"]:.1f} '
284-
f'| {sign}{row["change_pct"]:.1f} | {row["status"]}{flag} |\n')
271+
272+
f.write(f'| {row["name"]} | {row["mode"]} | {row["resolution"]} | ')
273+
274+
if not row['supported_a']:
275+
f.write('N/A | N/A | N/A | ')
276+
else:
277+
v = 'PASS' if row['verified_a'] else 'FAIL'
278+
f.write(f'{row["median_a"]:.3f} | {row["mps_a"]:.1f} | {v} | ')
279+
280+
if not row['supported_b']:
281+
f.write('N/A | N/A | N/A | ')
282+
else:
283+
v = 'PASS' if row['verified_b'] else 'FAIL'
284+
f.write(f'{row["median_b"]:.3f} | {row["mps_b"]:.1f} | {v} | ')
285+
286+
if row['speedup'] > 0:
287+
f.write(f'{row["speedup"]:.2f}x{flag}')
288+
else:
289+
f.write('N/A')
290+
f.write(' |\n')
285291
f.write('\n')
286292

287293
if has_unstable:
@@ -330,46 +336,49 @@ def write_markdown(impl_names, result_maps, all_keys, output_path, reports, syst
330336
def write_csv(impl_names, result_maps, all_keys, output_path, reports):
331337
with open(output_path + '.csv', 'w') as f:
332338
header = f'benchmark,category,mode,resolution'
333-
header += f',{impl_names[0]}_median_ms,{impl_names[0]}_mp_per_sec'
334-
header += f',{impl_names[1]}_median_ms,{impl_names[1]}_mp_per_sec'
335-
header += ',change_percent,status'
339+
header += f',{impl_names[0]}_median_ms,{impl_names[0]}_mp_per_sec,{impl_names[0]}_verified'
340+
header += f',{impl_names[1]}_median_ms,{impl_names[1]}_mp_per_sec,{impl_names[1]}_verified'
341+
header += ',speedup'
336342
f.write(header + '\n')
337343

338-
for key in all_keys:
344+
for key in sorted(all_keys):
339345
name, mode, resolution = key
340346
r_a = result_maps[0].get(key)
341347
r_b = result_maps[1].get(key)
342348

343-
if not (r_a and r_b):
349+
if not r_a and not r_b:
344350
continue
345-
if not (r_a.get('supported', False) and r_b.get('supported', False)):
346-
continue
347-
if not (r_a.get('verified', True) and r_b.get('verified', True)):
348-
continue
349-
350-
wc_a = r_a.get('wall_clock', {})
351-
wc_b = r_b.get('wall_clock', {})
352-
median_a = wc_a.get('median_ms', 0)
353-
median_b = wc_b.get('median_ms', 0)
354-
if median_a <= 0 or median_b <= 0:
355-
continue
356-
357-
mps_a = r_a.get('megapixels_per_sec', 0)
358-
mps_b = r_b.get('megapixels_per_sec', 0)
359-
category = r_a.get('category', '')
360-
change_pct = ((median_b - median_a) / median_a) * 100
361-
362-
if change_pct > 5.0:
363-
status = 'REGRESSION'
364-
elif change_pct < -5.0:
365-
status = 'IMPROVEMENT'
366-
else:
367-
status = 'same'
368351

369-
f.write(f'{name},{category},{mode},{resolution},'
370-
f'{median_a:.4f},{mps_a:.2f},'
371-
f'{median_b:.4f},{mps_b:.2f},'
372-
f'{change_pct:.2f},{status}\n')
352+
category = ''
353+
cols_a = ',,'
354+
cols_b = ',,'
355+
verified_a = False
356+
verified_b = False
357+
mps_a = 0
358+
mps_b = 0
359+
360+
if r_a and r_a.get('supported', False):
361+
category = r_a.get('category', '')
362+
wc = r_a.get('wall_clock', {})
363+
median = wc.get('median_ms', 0)
364+
mps_a = r_a.get('megapixels_per_sec', 0)
365+
verified_a = r_a.get('verified', True)
366+
cols_a = f'{median:.4f},{mps_a:.2f},{"PASS" if verified_a else "FAIL"}'
367+
368+
if r_b and r_b.get('supported', False):
369+
if not category:
370+
category = r_b.get('category', '')
371+
wc = r_b.get('wall_clock', {})
372+
median = wc.get('median_ms', 0)
373+
mps_b = r_b.get('megapixels_per_sec', 0)
374+
verified_b = r_b.get('verified', True)
375+
cols_b = f'{median:.4f},{mps_b:.2f},{"PASS" if verified_b else "FAIL"}'
376+
377+
speedup = ''
378+
if verified_a and verified_b and mps_a > 0:
379+
speedup = f'{mps_b / mps_a:.4f}'
380+
381+
f.write(f'{name},{category},{mode},{resolution},{cols_a},{cols_b},{speedup}\n')
373382

374383
print(f' Comparison CSV: {output_path}.csv')
375384

0 commit comments

Comments
 (0)