sb-scripts/generate_report.py at main · WenqingLan1/sb-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
#!/usr/bin/env python3
"""
generate_report.py
Compare two SuperBench markdown result files and generate an HTML report.

Supports all SuperBench benchmark types including:
- Model benchmarks (BERT, GPT, LSTM, ResNet, VGG, DenseNet, LLaMA)
- MICRO1 benchmarks (CUBLAS, cuBLASLt, cuDNN, GEMM-FLOPS including fp64/int8, kernel launch)
- MICRO2 benchmarks (CPU memory, GPU burn, NCCL bandwidth, matmul, sharding)
- Memory bandwidth tests (DTOD, GPUMEM, DTOH/HTOD via SM/DMA)
- GPU-STREAM, nvbandwidth, cpu-stream benchmarks
- IB, DISK benchmarks

Usage:
    python generate_report.py <baseline.md> <compare.md> [-o report.html] [--include-tables] [--baseline-label LABEL] [--compare-label LABEL]

Examples:
    # Basic usage with file paths as labels
    python generate_report.py baseline.md compare.md -o report.html

    # Custom labels for cleaner display
    python generate_report.py gb200.md gb300.md -o report.html --baseline-label "GB200" --compare-label "GB300"
"""
import argparse
import sys
import json
from pathlib import Path
import html
import plotly.graph_objects as go
import plotly.io as pio
import re
from collections import defaultdict
from statistics import mean

# Load unit specifications from data spec file
DATA_SPEC = {}

def load_data_spec(spec_file_path=None):
    """Load data spec from specified path or default location."""
    global DATA_SPEC
    try:
        if spec_file_path:
            spec_path = Path(spec_file_path)
        else:
            spec_path = Path(__file__).parent / "superbench_data_spec.json"

        if spec_path.exists():
            with open(spec_path, 'r') as f:
                DATA_SPEC = json.load(f)
        else:
            print(f"Warning: Data spec file not found: {spec_path}", file=sys.stderr)
    except Exception as e:
        print(f"Warning: Could not load data spec: {e}", file=sys.stderr)

# Edit this list to ignore metrics containing any of these tokens (case-insensitive).
# Example: 'correctness' will ignore 'gpu-copy-bw:correctness' and similar metrics.
# Note: Correctness tests and gpu-burn are typically pass/fail and may not be meaningful for performance comparison.
IGNORE_TOKENS = [
    "correctness",
    "gpu-burn",
    "lstm"
]

def normalize_gpu_metric_name(metric: str) -> str:
    """
    Normalize GPU metrics by removing GPU-specific numbering to enable averaging across GPUs.
    E.g., 'gpu-stream:perf/STREAM_ADD_double_gpu_0_buffer_4294967296_block_1024_bw'
    becomes 'gpu-stream:perf/STREAM_ADD_double_buffer_4294967296_block_1024_bw'
    Also handles 'gpu-burn/gpu_0_pass' -> 'gpu-burn/gpu_pass'
    """
    # Pattern to match gpu_<number> and remove it, handling both middle and end positions
    normalized = re.sub(r'_gpu_\d+_', '_', metric)  # Middle: _gpu_0_ -> _
    normalized = re.sub(r'_gpu_\d+$', '', normalized)  # End: _gpu_0 -> ''
    normalized = re.sub(r'/gpu_\d+_', '/', normalized)  # After slash: /gpu_0_ -> /
    normalized = re.sub(r'gpu_\d+_', '', normalized)  # Start: gpu_0_ -> ''
    return normalized

def aggregate_gpu_metrics(section_data: dict) -> dict:
    """
    Aggregate metrics across multiple GPUs by averaging values for metrics that differ only by GPU number.
    """
    # Group metrics by their normalized names
    metric_groups = defaultdict(list)

    for metric, stats_dict in section_data.items():
        if 'gpu_' in metric and ('gpu-stream' in metric or 'gpu-burn' in metric):
            normalized_name = normalize_gpu_metric_name(metric)
            metric_groups[normalized_name].append(stats_dict)
        else:
            # Keep non-GPU metrics as-is
            metric_groups[metric].append(stats_dict)

    # Average the grouped metrics
    aggregated = {}
    for normalized_metric, stats_list in metric_groups.items():
        if len(stats_list) > 1:
            # Multiple GPU metrics to average
            aggregated[normalized_metric] = {}
            # Average mean values if available
            mean_values = [s.get('mean') for s in stats_list if s.get('mean') is not None]
            if mean_values:
                aggregated[normalized_metric]['mean'] = mean(mean_values)

            # Average std values if available (note: this is a simplification)
            std_values = [s.get('std') for s in stats_list if s.get('std') is not None]
            if std_values:
                aggregated[normalized_metric]['std'] = mean(std_values)
        else:
            # Single value, keep as-is
            aggregated[normalized_metric] = stats_list[0]

    return aggregated

def parse_file(filepath: Path):
    """
    Parse a markdown-style results file into {section: {metric: {'mean': value, 'std': value}}}.
    Preserves insertion order of sections and metrics.
    """
  # Notes:
  # - The parser is intentionally permissive because different summary files
  #   may format tables differently (e.g., extra pipes, missing headers).
  # - We look for common 'mean' column labels and fall back to simple
  #   two-column rows when possible. Non-numeric cells are ignored.
    data = {}
    current = None
    if not filepath.exists():
        return data
    with filepath.open("r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            if line.startswith("## "):
                current = line[3:].strip()
                data[current] = {}
                continue
            if line.startswith("# "):
                current = line[2:].strip()
                data.setdefault(current, {})
                continue

            if "|" in line:
                parts = [p.strip() for p in line.split("|")]
                parts = [p for p in parts if p != ""]
                if len(parts) < 2:
                    continue

                if current is None:
                    current = "Global"
                    data.setdefault(current, {})

                metric = parts[0]
                if len(parts) >= 3:
                    stat_type = parts[1].lower()
                    try:
                        value = float(parts[2])
                        # Initialize metric dict if not exists
                        if metric not in data[current]:
                            data[current][metric] = {}

                        # Store the statistic
                        if stat_type in ("mean", "avg", "average"):
                            data[current][metric]['mean'] = value
                        elif stat_type in ("std", "stddev", "standard_deviation"):
                            data[current][metric]['std'] = value
                        elif stat_type in ("min", "minimum"):
                            data[current][metric]['min'] = value
                        elif stat_type in ("max", "maximum"):
                            data[current][metric]['max'] = value
                    except Exception:
                        pass
                    continue

                # Last resort: metric,value (2 columns) - treat as mean
                if len(parts) == 2:
                    try:
                        value = float(parts[1])
                        if metric not in data[current]:
                            data[current][metric] = {}
                        data[current][metric]['mean'] = value
                    except Exception:
                        pass

    # Aggregate GPU metrics across multiple GPUs
    for section in data:
        data[section] = aggregate_gpu_metrics(data[section])

    return data

def group_key(metric: str) -> str:
    """
    Group by the leading token of the metric, stripping trailing sizes/variants.
    Enhanced to handle new benchmark types including fp64 and int8 GEMM operations.
    Special grouping for model benchmarks: VGG/DenseNet vs other models.
    """
    # Normalize input to a string and trim whitespace.
    # We aim to extract a stable "group" prefix for related metrics.
    m = str(metric).strip()

    # Special handling for model benchmarks - separate VGG/DenseNet from others
    m_lower = m.lower()
    if any(model in m_lower for model in ['vgg', 'densenet']):
        return 'models-group1'
    elif any(model in m_lower for model in ['bert', 'gpt', 'resnet', 'lstm', 'llama', 'model']):
        return 'models-group2'

    # Enhanced explicit prefixes mapping for better grouping
    PREFIX_MAP = {
        # Group NCCL bandwidth tests by operation type
        'nccl-bw:nvlink-allgather': 'nccl-allgather',
        'nccl-bw:nvlink-alltoall': 'nccl-alltoall',
        'nccl-bw:nvlink-broadcast': 'nccl-broadcast',
        'nccl-bw:nvlink-reduce': 'nccl-reduce',
        'nccl-bw:nvlink-reducescatter': 'nccl-reducescatter',
        'nccl-bw:nvlink': 'nccl-allreduce',  # allreduce is the base nvlink test
        # Group GPU copy bandwidth tests by direction and method
        'gpu-copy-bw:correctness': 'gpu-copy-correctness',
        'gpu-copy-bw:perf': 'gpu-copy-perf',
        # Group GPU stream tests
        'gpu-stream:perf': 'gpu-stream',
        # Group CPU stream tests by socket
        'cpu-stream:cross-socket0': 'cpu-stream-socket0',
        'cpu-stream:cross-socket1': 'cpu-stream-socket1',
    }

    # Check for exact matches first
    if m in PREFIX_MAP:
        return PREFIX_MAP[m]

    # Check for prefix matches in order of specificity
    for prefix, group in sorted(PREFIX_MAP.items(), key=len, reverse=True):
        if m.startswith(prefix):
            return group

    # Capture a leading token that starts with a letter and continues with
    # letters, digits, underscores or dashes. This catches common metric
    # identifiers like 'gpu-stream', 'nvbandwidth', 'gemm-flops', etc.
    mo = re.match(r'^([A-Za-z][A-Za-z0-9_\-]*)', m)
    if mo:
        return mo.group(1)
    # fallback to previous separators logic
    if ":" in m:
        return m.split(":", 1)[0]
    if "@" in m:
        return m.split("@", 1)[0]
    if "-" in m:
        return m.split("-", 1)[0]
    return m

def get_metric_unit(metric: str) -> str:
    """
    Extract unit from metric name using the data spec file.
    Returns empty string if no unit found.
    """
    if not DATA_SPEC:
        return ""

    # Special cases for common patterns
    if metric.endswith('_bw') or 'busbw' in metric.lower():
        # Most bandwidth metrics are GB/s
        if 'nccl' in metric.lower():
            return " (GB/s)"
        elif 'gpu-stream' in metric.lower():
            return " (GB/s)"
        elif 'gpu-copy' in metric.lower():
            return " (GB/s)"

    # GEMM-FLOPS metrics - hardcode unified unit since GFLOPS and GIOPS are essentially the same
    if 'gemm-flops' in metric.lower() and (metric.endswith('_flops') or metric.endswith('_iops')):
        return " (GFLOPS/GIOPS)"

    # Check all unit suffixes across all benchmark types
    for benchmark_type, units in DATA_SPEC.items():
        if isinstance(units, dict):
            for unit_suffix, unit_info in units.items():
                # Simple endswith check for unit suffixes
                if metric.endswith(f"_{unit_suffix}"):
                    if isinstance(unit_info, dict) and "unit" in unit_info:
                        return f" ({unit_info['unit']})"

    return ""

def should_ignore_metric(metric: str) -> bool:
    if not IGNORE_TOKENS:
        return False
    m = metric.lower()
    for tok in IGNORE_TOKENS:
        if tok.lower() in m:
            return True

    # Filter out ratio metrics from gpu-stream (keep only bandwidth)
    if 'gpu-stream' in m and '_ratio' in m:
        return True

    # Special filtering for cpu-memory-bw-latency - only keep cross-NUMA bandwidth
    if 'cpu-memory-bw-latency' in m and 'mem_bandwidth_matrix' in m:
        # Only keep numa 0->1 and 1->0 bandwidth measurements
        if 'numa_0_1_bw' in m or 'numa_1_0_bw' in m:
            return False  # Keep these
        else:
            return True   # Ignore all other cpu-memory-bw-latency metrics

    return False

def is_nan_or_none(v):
    """Check if value is None or NaN (single node case)."""
    if v is None:
        return True
    try:
        import math
        return math.isnan(float(v))
    except (ValueError, TypeError):
        return True

def std_to_percentage(mean_val, std_val):
    """Convert standard deviation to percentage of mean (coefficient of variation)."""
    if is_nan_or_none(mean_val) or is_nan_or_none(std_val) or mean_val == 0:
        return None
    try:
        return (float(std_val) / abs(float(mean_val))) * 100
    except (ValueError, TypeError, ZeroDivisionError):
        return None

def fmt(v, metric_name=""):
    if v is None:
        return ""
    try:
        # pretty-print integers without decimals, floats with 2 decimals
        if float(v).is_integer():
            return str(int(v))
        # Special case: kernel-launch needs 4 decimal places for precision
        if "kernel-launch" in metric_name.lower():
            return f"{float(v):,.4f}"
        return f"{float(v):,.2f}"
    except Exception:
        return str(v)


MAX_WRAP_CHUNKS = 10

def wrap_chunks(s: str, width: int = 50, max_chunks: int = MAX_WRAP_CHUNKS):
    """
    Break a long string into a list of chunk strings suitable for putting into
    `customdata` so `hovertemplate` can compose them with `<br>` between chunks.

    Behaviour summary:
    - Prefer splitting on whitespace to preserve whole words when possible.
    - If a single word/token exceeds `width`, break that token into
      width-sized slices so extremely long identifiers are still wrapped.
    - Always return exactly `max_chunks` entries (pad with empty strings)
      so `customdata` rows have a stable shape.
    """
    if s is None:
        return [""] * max_chunks
    text = str(s)
    parts = []
    cur = ""
    # Split while preserving whitespace groups so we can rebuild lines
    for token in re.split(r'(\s+)', text):
        if not token:
            continue
        tok = token.strip()
        if not tok:
            continue
        # If token itself is longer than width, flush any current buffer
        # then slice the long token into width-sized pieces.
        if len(tok) > width:
            if cur:
                parts.append(cur.strip())
                cur = ""
            t = tok
            for i in range(0, len(t), width):
                parts.append(t[i:i+width])
            continue
        # Otherwise attempt to append to the current line; if it would exceed
        # width, push current and start a new one.
        if len((cur + ' ' + tok).strip()) > width and cur:
            parts.append(cur.strip())
            cur = tok
        else:
            cur = (cur + ' ' + tok).strip()
    if cur:
        parts.append(cur.strip())
    # Escape HTML and normalize to fixed length
    parts = [html.escape(p) for p in parts][:max_chunks]
    if len(parts) < max_chunks:
        parts.extend([""] * (max_chunks - len(parts)))
    return parts


def wrap_text_single(s: str, width: int = 50, max_chunks: int = MAX_WRAP_CHUNKS) -> str:
    """
    Return a single HTML string with <br> between non-empty chunks.
    This avoids emitting empty lines when fewer chunks are used.
    """
    parts = wrap_chunks(s, width=width, max_chunks=max_chunks)
    # Remove empty parts and join with <br> so the hover shows no blank lines.
    parts = [p for p in parts if p]
    if not parts:
        return ""
    return "<br>".join(parts)

def build_table_html(metrics, baseline_vals, compare_vals, percent_diffs, baseline_stds=None, compare_stds=None, baseline_label="Baseline", compare_label="Compare"):
    """
    Build a small HTML table for a group.
    """
    rows = []
    rows.append("<table border='1' cellpadding='6' cellspacing='0'>")
    has_stds = baseline_stds is not None and compare_stds is not None

    if has_stds:
        rows.append(f"<thead><tr><th>Metric</th><th>{baseline_label} (±std%)</th><th>{compare_label} (±std%)</th><th>% diff</th></tr></thead>")
    else:
        rows.append(f"<thead><tr><th>Metric</th><th>{baseline_label}</th><th>{compare_label}</th><th>% diff</th></tr></thead>")

    rows.append("<tbody>")

    if has_stds:
        for m, b, c, pd, b_std, c_std in zip(metrics, baseline_vals, compare_vals, percent_diffs, baseline_stds, compare_stds):
            mb = html.escape(m)
            # Convert std to percentage
            b_std_pct = std_to_percentage(b, b_std)
            c_std_pct = std_to_percentage(c, c_std)
            bb = f"{fmt(b, m)} ±{b_std_pct:.1f}%" if b is not None and b_std_pct is not None else fmt(b, m)
            cc = f"{fmt(c, m)} ±{c_std_pct:.1f}%" if c is not None and c_std_pct is not None else fmt(c, m)
            pd_s = "" if pd is None else f"{pd:.2f}%"
            rows.append(f"<tr><td>{mb}</td><td style='text-align:right'>{bb}</td><td style='text-align:right'>{cc}</td><td style='text-align:right'>{pd_s}</td></tr>")
    else:
        for m, b, c, pd in zip(metrics, baseline_vals, compare_vals, percent_diffs):
            mb = html.escape(m)
            bb = html.escape(fmt(b, m))
            cc = html.escape(fmt(c, m))
            pd_s = "" if pd is None else f"{pd:.2f}%"
            rows.append(f"<tr><td>{mb}</td><td style='text-align:right'>{bb}</td><td style='text-align:right'>{cc}</td><td style='text-align:right'>{pd_s}</td></tr>")

    rows.append("</tbody></table>")
    return "\n".join(rows)

def build_report(baseline_path, compare_path, out_path, include_tables=False, baseline_label=None, compare_label=None, data_spec=None):
    # Load data spec if provided
    load_data_spec(data_spec)

    baseline_data = parse_file(Path(baseline_path))
    compare_data = parse_file(Path(compare_path))
    label_baseline = baseline_label if baseline_label else baseline_path
    label_compare = compare_label if compare_label else compare_path

    # Section ordering: baseline sections first, then compare-only sections
    sections = []
    for s in baseline_data.keys():
        if s not in sections:
            sections.append(s)
    for s in compare_data.keys():
        if s not in sections:
            sections.append(s)

    html_snippets = []
    plot_pairs = []
    id_counter = 0

    for sec in sections:
        # Determine metric order: baseline metrics then compare-only
        baseline_metrics = [m for m in list(baseline_data.get(sec, {}).keys()) if not should_ignore_metric(m)]
        compare_metrics = [m for m in list(compare_data.get(sec, {}).keys()) if not should_ignore_metric(m)]
        metrics = []
        for m in baseline_metrics:
            if m not in metrics:
                metrics.append(m)
        for m in compare_metrics:
            if m not in metrics:
                metrics.append(m)
        if not metrics:
            continue

        # Group metrics by group_key preserving order
        groups = []
        group_map = {}
        for m in metrics:
            g = group_key(m)
            if g not in group_map:
                group_map[g] = []
                groups.append(g)
            group_map[g].append(m)

        html_snippets.append(f"<h2>{html.escape(sec)}</h2>")
        for g in groups:
            group_metrics = [m for m in group_map[g] if not should_ignore_metric(m)]
            if not group_metrics:
                continue

            # Extract mean values and standard deviations
            baseline_stats = [baseline_data.get(sec, {}).get(m, {}) for m in group_metrics]
            compare_stats = [compare_data.get(sec, {}).get(m, {}) for m in group_metrics]

            baseline_vals = [stats.get('mean') if isinstance(stats, dict) else stats for stats in baseline_stats]
            compare_vals = [stats.get('mean') if isinstance(stats, dict) else stats for stats in compare_stats]

            baseline_stds = [stats.get('std') if isinstance(stats, dict) else None for stats in baseline_stats]
            compare_stds = [stats.get('std') if isinstance(stats, dict) else None for stats in compare_stats]

            percent_diffs = []
            percent_diff_errors = []
            for b, c, b_std, c_std in zip(baseline_vals, compare_vals, baseline_stds, compare_stds):
                if b is None or c is None:
                    percent_diffs.append(None)
                    percent_diff_errors.append(None)
                else:
                    try:
                        if b != 0:
                            pct_diff = ((c - b) / b) * 100
                            percent_diffs.append(pct_diff)

                            # Error propagation for percentage difference: sqrt((σc/b)² + (c*σb/b²)²) * 100
                            if b_std is not None and c_std is not None and b != 0:
                                error = 100 * ((c_std / abs(b))**2 + (c * b_std / b**2)**2)**0.5
                                percent_diff_errors.append(error)
                            else:
                                percent_diff_errors.append(None)
                        else:
                            percent_diffs.append(None)
                            percent_diff_errors.append(None)
                    except Exception:
                        percent_diffs.append(None)
                        percent_diff_errors.append(None)

            html_snippets.append(f"<h3>Group: {html.escape(g)}</h3>")
            if include_tables:
                html_snippets.append(build_table_html(group_metrics, baseline_vals, compare_vals, percent_diffs, baseline_stds, compare_stds, label_baseline, label_compare))

            # Charts
            id_counter += 1
            # Reserve a fixed plotting area height and separate bottom margin
            # to prevent long metric names (x-axis labels) from stealing plot
            # vertical space. Values lowered to reduce overall chart height
            # while keeping label area readable on typical monitors.
            # Increase bottom margin for groups with many long metric names
            extra_margin = 40 if len(group_metrics) > 10 else 0

            # Dynamic font size and error bar thickness based on number of metrics to prevent overlap
            num_metrics = len(group_metrics)
            if num_metrics <= 5:
                font_size = 10  # Large font for few items
                error_thickness = 2
                error_width = 4
            elif num_metrics <= 10:
                font_size = 9   # Medium font
                error_thickness = 2
                error_width = 3
            elif num_metrics <= 20:
                font_size = 8   # Small font
                error_thickness = 1
                error_width = 2
            elif num_metrics <= 30:
                font_size = 7   # Smaller font
                error_thickness = 1
                error_width = 2
            else:
                font_size = 6   # Very small font for many items
                error_thickness = 1
                error_width = 1

            plot_area_height = 360
            bottom_margin_for_labels = 120 + extra_margin
            height_bar = plot_area_height + bottom_margin_for_labels

            plot_area_height_diff = 260
            bottom_margin_for_labels_diff = 100 + extra_margin
            height_diff = plot_area_height_diff + bottom_margin_for_labels_diff
            bar_id = f"plot_bar_{id_counter}"
            diff_id = f"plot_diff_{id_counter}"

            # Detect common unit from group metrics for y-axis label
            common_unit = ""
            if group_metrics:
                # For mixed groups, prioritize the most common unit type
                unit_counts = {}
                for m in group_metrics:
                    unit = get_metric_unit(m)
                    if unit:
                        unit_counts[unit] = unit_counts.get(unit, 0) + 1

                if unit_counts:
                    # Use the most common unit
                    common_unit = max(unit_counts, key=unit_counts.get)

            # prepare wrapped hover text and customdata rows [report_name, wrapped_html]
            wrapped_single = [wrap_text_single(m, width=50, max_chunks=MAX_WRAP_CHUNKS) for m in group_metrics]
            # Short labels for x-axis: remove everything before the first '/'
            display_labels = [m.split("/", 1)[1] if "/" in m else m for m in group_metrics]
            custom_rows_baseline = [[label_baseline, ws] for ws in wrapped_single]
            custom_rows_compare = [[label_compare, ws] for ws in wrapped_single]

            fig_bar = go.Figure()
            # Create hover text with ±std% when available
            baseline_hover_vals = []
            compare_hover_vals = []
            baseline_std_pcts = []
            compare_std_pcts = []

            for val, std, metric in zip(baseline_vals, baseline_stds, group_metrics):
                std_pct = std_to_percentage(val, std)
                baseline_std_pcts.append(std_pct)
                if val is not None:
                    if std_pct is not None:
                        baseline_hover_vals.append(f"{fmt(val, metric)} ±{std_pct:.1f}%")
                    else:
                        baseline_hover_vals.append(fmt(val, metric))
                else:
                    baseline_hover_vals.append("")

            for val, std, metric in zip(compare_vals, compare_stds, group_metrics):
                std_pct = std_to_percentage(val, std)
                compare_std_pcts.append(std_pct)
                if val is not None:
                    if std_pct is not None:
                        compare_hover_vals.append(f"{fmt(val, metric)} ±{std_pct:.1f}%")
                    else:
                        compare_hover_vals.append(fmt(val, metric))
                else:
                    compare_hover_vals.append("")

            # Convert percentage std back to absolute for error bars
            baseline_error_bars = []
            compare_error_bars = []
            for val, std_pct in zip(baseline_vals, baseline_std_pcts):
                if val is not None and std_pct is not None:
                    baseline_error_bars.append(abs(val) * std_pct / 100)
                else:
                    baseline_error_bars.append(None)  # None hides error bars completely

            for val, std_pct in zip(compare_vals, compare_std_pcts):
                if val is not None and std_pct is not None:
                    compare_error_bars.append(abs(val) * std_pct / 100)
                else:
                    compare_error_bars.append(None)  # None hides error bars completely

            fig_bar.add_trace(go.Bar(x=group_metrics, y=baseline_vals, name=label_baseline,
                                     customdata=[[custom_rows_baseline[i][0], custom_rows_baseline[i][1], baseline_hover_vals[i]] for i in range(len(group_metrics))],
                                     error_y=dict(type='data', array=baseline_error_bars, visible=True, thickness=error_thickness, width=error_width),
                                     hovertemplate="%{customdata[0]}<br>%{customdata[1]}<br><b>%{customdata[2]}</b><extra></extra>"))
            fig_bar.add_trace(go.Bar(x=group_metrics, y=compare_vals, name=label_compare,
                                     customdata=[[custom_rows_compare[i][0], custom_rows_compare[i][1], compare_hover_vals[i]] for i in range(len(group_metrics))],
                                     error_y=dict(type='data', array=compare_error_bars, visible=True, thickness=error_thickness, width=error_width),
                                     hovertemplate="%{customdata[0]}<br>%{customdata[1]}<br><b>%{customdata[2]}</b><extra></extra>"))
            # Increase chart height for better y-axis spacing/readability.
            # Place the legend horizontally above the plot so long file
            # path labels do not consume horizontal space on the right.
            fig_bar.update_traces(hoverlabel=dict(align='left', bgcolor='rgba(255,255,255,0.9)', bordercolor='black'))
            fig_bar.update_layout(
                title=f"[{sec}] {g} - Metric Comparison",
                xaxis_title="", yaxis_title=f"Mean Value{common_unit}",
                barmode="group", template="plotly_white",
                height=height_bar, hovermode="closest",
                legend=dict(orientation='h', y=1.08, x=0.01, xanchor='left'),
                margin=dict(t=120, b=bottom_margin_for_labels)
            )
            # Keep x-axis label area stable: rotate ticks and prevent pan/zoom changing
            # the x-axis label layout (fixedrange keeps labels readable during pan).
            fig_bar.update_xaxes(tickangle=45, automargin=False, fixedrange=False,
                                 title_standoff=40, tickmode='array',
                                 tickvals=group_metrics, ticktext=display_labels,
                                 tickfont=dict(size=font_size))
            fig_bar.update_traces(hoverlabel=dict(align='left'))

            diff_custom = [[f"% diff ({label_compare} vs {label_baseline})", ws] for ws in wrapped_single]
            fig_diff = go.Figure()
            # For percentage diff plot, use proper error propagation
            # Single measurements (NaN std) are treated as having zero uncertainty
            diff_error_bars = []
            for b_val, c_val, b_std_pct, c_std_pct in zip(baseline_vals, compare_vals, baseline_std_pcts, compare_std_pcts):
                if b_val is None or c_val is None or b_val == 0:
                    diff_error_bars.append(None)
                    continue

                # Treat NaN std as zero uncertainty (single measurement)
                b_std_pct_safe = b_std_pct if b_std_pct is not None else 0.0
                c_std_pct_safe = c_std_pct if c_std_pct is not None else 0.0

                # Convert percentage std back to absolute values for error propagation
                b_std_abs = abs(b_val) * b_std_pct_safe / 100
                c_std_abs = abs(c_val) * c_std_pct_safe / 100

                # Error propagation for percentage difference: sqrt((σc/b)² + (c*σb/b²)²) * 100
                try:
                    error_pct = ((c_std_abs / abs(b_val))**2 + (c_val * b_std_abs / b_val**2)**2)**0.5 * 100
                    diff_error_bars.append(error_pct)
                except (ZeroDivisionError, ValueError):
                    diff_error_bars.append(None)

            fig_diff.add_trace(go.Scatter(x=group_metrics, y=percent_diffs, mode="lines+markers",
                                           name=f"% diff ({label_compare} vs {label_baseline})",
                                           line=dict(color="orange"),
                                           customdata=diff_custom,
                                           hovertemplate="%{customdata[0]}<br>%{customdata[1]}<br><b>%{y:.2f}%</b><extra></extra>"))
            fig_diff.add_shape(type='line', x0=-0.5, x1=len(group_metrics)-0.5, y0=0, y1=0,
                               line=dict(color='gray', dash='dash'))
            # Slightly larger diff plot to match bar chart vertical space.
            # Use a top-positioned horizontal legend to avoid horizontal clutter.
            fig_diff.update_traces(hoverlabel=dict(align='left', bgcolor='rgba(255,255,255,0.9)', bordercolor='black'))
            fig_diff.update_layout(
                title=f"[{sec}] {g} - Percent Difference ({label_compare} vs {label_baseline})",
                xaxis_title="", yaxis_title="Percent Difference (%)",
                template="plotly_white", height=height_diff, hovermode="closest",
                legend=dict(orientation='h', y=1.08, x=0.01, xanchor='left'),
                margin=dict(t=120, b=bottom_margin_for_labels_diff)
            )
            fig_diff.update_xaxes(tickangle=45, automargin=False, fixedrange=False,
                                  title_standoff=36, tickmode='array',
                                  tickvals=group_metrics, ticktext=display_labels,
                                  tickfont=dict(size=font_size))
            fig_diff.update_traces(hoverlabel=dict(align='left'))

            # Export HTML fragments and register pair for JS
            bar_html = fig_bar.to_html(full_html=False, include_plotlyjs=False, div_id=bar_id)
            diff_html = fig_diff.to_html(full_html=False, include_plotlyjs=False, div_id=diff_id)
            html_snippets.append(bar_html)
            html_snippets.append(diff_html)

            plot_pairs.append({
                "bar": bar_id,
                "diff": diff_id,
                "x": group_metrics,
                "ys": [baseline_vals, compare_vals],
                "diffs": percent_diffs
            })

    # Build JS that syncs zoom/pan and triggers y-autorange on visible data
    js_sync = r"""
<script>
document.addEventListener('DOMContentLoaded', function(){
  const pairs = REPLACE_PAIRS_JSON;

  function extractXRange(relayoutData){
    if(!relayoutData) return null;
    if(relayoutData['xaxis.range']) return relayoutData['xaxis.range'];
    let x0 = null, x1 = null;
    for(const k in relayoutData){
      if(k === '_sync_source') continue;
      const m = k.match(/^xaxis\.range\[(\d)\]$/);
      if(m){
        if(m[1]==='0') x0 = relayoutData[k];
        if(m[1]==='1') x1 = relayoutData[k];
      }
    }
    if(x0 !== null && x1 !== null) return [x0, x1];
    return null;
  }

  function visibleIndexRangeFromX(xr, xArray){
    if(!xr) return null;
    let a = xr[0], b = xr[1];
    if(typeof a === 'string' && typeof b === 'string' && xArray.indexOf(a) !== -1 && xArray.indexOf(b) !== -1){
      let i0 = xArray.indexOf(a), i1 = xArray.indexOf(b);
      if(i0 > i1){ let t = i0; i0 = i1; i1 = t; }
      return {start: i0, end: i1};
    }
    let na = parseFloat(a), nb = parseFloat(b);
    if(!isNaN(na) && !isNaN(nb)){
      let i0 = Math.max(0, Math.floor(Math.min(na, nb)));
      let i1 = Math.min(xArray.length - 1, Math.ceil(Math.max(na, nb)));
      return {start: i0, end: i1};
    }
    return null;
  }

  function computeRangeForValues(arrays, idxRange){
    if(!idxRange) return null;
    let ymin = Number.POSITIVE_INFINITY, ymax = Number.NEGATIVE_INFINITY;
    let found = false;
    for(let a=0; a<arrays.length; a++){
      const arr = arrays[a] || [];
      for(let i=idxRange.start; i<=idxRange.end && i < arr.length; i++){
        const v = arr[i];
        if(v === null || v === undefined) continue;
        const num = Number(v);
        if(Number.isFinite(num)){
          found = true;
          if(num < ymin) ymin = num;
          if(num > ymax) ymax = num;
        }
      }
    }
    if(!found) return null;
    if(ymin === ymax){
      if(ymin === 0){ ymin = -1; ymax = 1; }
      else { const pad = Math.abs(ymin) * 0.06; ymin -= pad; ymax += pad; }
    } else { const span = ymax - ymin; const pad = span * 0.06; ymin -= pad; ymax += pad; }
    return [ymin, ymax];
  }

  pairs.forEach(function(pair){
    const barDiv = document.getElementById(pair.bar);
    const diffDiv = document.getElementById(pair.diff);
    const xArray = pair.x || [];
    const ys = pair.ys || [[],[]];
    const diffs = pair.diffs || [];

    function onRelayout(sourceId, targetId, eventData){
      if(eventData && eventData['_sync_source']) return;
      const xr = extractXRange(eventData);
      const idxRange = visibleIndexRangeFromX(xr, xArray);
      if(idxRange){
                // Compute only the percent-diff range; for bar charts prefer Plotly's
                // built-in autorange so axis ticks are nicely rounded and match the
                // default behaviour seen in the older notebook.
                const diffRange = computeRangeForValues([diffs], idxRange);
                try{
                    let payload = {'xaxis.range': xr, '_sync_source': sourceId};
                    if(targetId.startsWith('plot_diff_')){
                        if(diffRange) payload['yaxis.range'] = diffRange;
                        else payload['yaxis.autorange'] = true;
                    } else {
                        // For bar charts, ask Plotly to autorange (nicer tick placement).
                        payload['yaxis.autorange'] = true;
                    }
                    Plotly.relayout(targetId, payload);
                }catch(e){}
                try{
                    if(sourceId === pair.bar && diffDiv){
                        if(diffRange) Plotly.relayout(pair.diff, {'yaxis.range': diffRange, '_sync_source': sourceId});
                        else Plotly.relayout(pair.diff, {'yaxis.autorange': true, '_sync_source': sourceId});
                    }
                    if(sourceId === pair.diff && barDiv){
                        // When the percent-diff trace is the source, let the bar chart
                        // autorange rather than imposing an explicit numeric range.
                        try{
                            Plotly.relayout(pair.bar, {'yaxis.autorange': true, '_sync_source': sourceId});
                        }catch(e){}
                    }
                }catch(e){}
      } else {
        try{ Plotly.relayout(targetId, {'yaxis.autorange': true, '_sync_source': sourceId}); }catch(e){}
      }
    }

    if(barDiv && barDiv.on){ barDiv.on('plotly_relayout', function(eventData){ onRelayout(pair.bar, pair.diff, eventData); }); }
    if(diffDiv && diffDiv.on){ diffDiv.on('plotly_relayout', function(eventData){ onRelayout(pair.diff, pair.bar, eventData); }); }
  });
});
</script>
"""
    js_sync = js_sync.replace("REPLACE_PAIRS_JSON", json.dumps(plot_pairs))

    table_note = " (tables included)" if include_tables else " (charts only)"
    ignore_note = f" (ignored: {', '.join(IGNORE_TOKENS)})" if IGNORE_TOKENS else ""
    final_html = f"""<html>
<head>
  <title>Metrics comparison{table_note}{ignore_note}</title>
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
  <style>
    body {{ font-family: Arial; padding: 20px; }}
    .plot-container {{ margin-bottom: 60px; }}
    table {{ border-collapse: collapse; margin-bottom: 12px; }}
    th {{ background:#eee; }}
  </style>
</head>
<body>
  <h1>Plotly Metrics Comparison Report{table_note}{ignore_note}</h1>
  <p><strong>Baseline:</strong> {html.escape(label_baseline)}<br/><strong>Compare:</strong> {html.escape(label_compare)}</p>
  {"".join(f'<div class="plot-container">{plot}</div>' for plot in html_snippets)}
  {js_sync}
</body>
</html>"""

    Path(out_path).write_text(final_html, encoding="utf-8")
    print(f"✔ Report written to: {out_path}")

def main():
    p = argparse.ArgumentParser()
    p.add_argument("baseline", help="Baseline markdown summary (used for ordering)")
    p.add_argument("compare", help="Compare markdown summary")
    p.add_argument("-o", "--output", default="comparison_report.html")
    p.add_argument("--include-tables", action="store_true",
                   help="Include numeric HTML tables for each metric group in the report.")
    p.add_argument("--baseline-label", help="Custom label for baseline data (default: use file path)")
    p.add_argument("--compare-label", help="Custom label for compare data (default: use file path)")
    p.add_argument("--data-spec", help="Path to superbench_data_spec.json file for unit information")
    args = p.parse_args()
    if not Path(args.baseline).exists():
        print("Baseline file not found:", args.baseline, file=sys.stderr); sys.exit(2)
    if not Path(args.compare).exists():
        print("Compare file not found:", args.compare, file=sys.stderr); sys.exit(2)
    build_report(args.baseline, args.compare, args.output,
                include_tables=args.include_tables,
                baseline_label=args.baseline_label,
                compare_label=args.compare_label,
                data_spec=args.data_spec)

if __name__ == "__main__":
    main()