-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathexplorer.html
More file actions
3159 lines (3002 loc) · 212 KB
/
Copy pathexplorer.html
File metadata and controls
3159 lines (3002 loc) · 212 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>IKP Explorer</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/monaco-editor@0.52.0/min/vs/editor/editor.main.css">
<style>
/* Light theme palette. Custom-property values are written with no space after
   the colon so the stored value is exactly the colour literal. */
:root {
  /* surfaces and rules */
  --bg:#ffffff;
  --bg2:#f6f8fa;
  --bg3:#eaeef2;
  --border:#d0d7de;
  /* text tones */
  --text:#1f2328;
  --dim:#656d76;
  --bright:#1f2328;
  /* named hues */
  --accent:#0969da;
  --green:#1a7f37;
  --orange:#bc4c00;
  --purple:#8250df;
  --yellow:#9a6700;
  --red:#cf222e;
}

/* Global reset: border-box sizing, no default margins or padding. */
* {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
}

/* The document fills the viewport and never scrolls as a whole. */
html,
body {
  height: 100%;
  overflow: hidden;
  background: var(--bg);
  color: var(--text);
  font-family: 'JetBrains Mono', 'Fira Code', 'SF Mono', Consolas, monospace;
  font-size: 13px;
}
/* Top bar: a fixed 38px flex row. The .main rule subtracts the same 38px from
   the viewport height, so the two values must change together. */
.hdr {
  background: var(--bg2);
  border-bottom: 1px solid var(--border);
  padding: 6px 16px;
  display: flex;
  align-items: center;
  gap: 12px;
  flex-shrink: 0;
  height: 38px;
}

/* Application title. */
.hdr h1 {
  font-size: 14px;
  color: var(--accent);
  font-weight: 700;
  white-space: nowrap;
}

/* File name shown next to the title. */
.hdr .fname {
  color: var(--bright);
  font-size: 13px;
}

/* Separator glyph between title and file name. */
.hdr .sep {
  color: var(--dim);
}

/* Summary strip; the auto left margin pushes it to the right edge of the bar. */
.hdr .info {
  color: var(--dim);
  font-size: 11px;
  margin-left: auto;
  display: flex;
  gap: 14px;
}

/* Emphasised values inside the summary strip. */
.hdr .info b {
  color: var(--accent);
  font-weight: 600;
}
/* Workspace row: everything below the 38px header. */
.main {
  display: flex;
  height: calc(100vh - 38px);
}

/* A panel is a vertical flex column that clips its content. */
.panel {
  display: flex;
  flex-direction: column;
  overflow: hidden;
  min-width: 80px;
}

/* 30px caption strip at the top of a panel. */
.panel-hdr {
  background: var(--bg2);
  border-bottom: 1px solid var(--border);
  padding: 4px 10px;
  font-size: 11px;
  color: var(--dim);
  font-weight: 600;
  text-transform: uppercase;
  letter-spacing: .5px;
  display: flex;
  align-items: center;
  gap: 8px;
  flex-shrink: 0;
  height: 30px;
}

/* Any active child of a caption strip gets a bright label and accent underline. */
.panel-hdr .active {
  color: var(--bright);
  border-bottom: 2px solid var(--accent);
}

/* Clickable tab labels for the assembly views. */
.asm-tab {
  cursor: pointer;
  padding: 2px 6px;
}

.asm-tab:hover {
  color: var(--text);
}

.asm-tab.active {
  color: var(--bright);
}

/* Status text on the right of a caption strip; undoes the strip's uppercase,
   bold and letter-spacing so it reads as plain small text. */
.asm-status {
  margin-left: auto;
  font-weight: 400;
  text-transform: none;
  letter-spacing: 0;
  font-size: 10px;
}

/* Host element for an editor: takes the remaining height of its panel. */
.editor-wrap {
  flex: 1;
  overflow: hidden;
}
/* Drag handles between resizable panes. NOTE(review): the class names look like
   the ones Split.js (loaded further down the page) puts on its gutters - confirm. */
.gutter {
  background: var(--border);
  cursor: col-resize;
  position: relative;
  transition: background .15s;
  flex-shrink: 0;
}

/* Handle between vertically stacked panes: row-resize cursor. */
.gutter-vertical {
  cursor: row-resize !important;
}

.gutter:hover {
  background: var(--accent);
}

/* Small grip mark centred on the handle. */
.gutter::after {
  content: '';
  position: absolute;
  top: 50%;
  left: 50%;
  transform: translate(-50%, -50%);
  width: 2px;
  height: 30px;
  background: var(--dim);
  border-radius: 2px;
  transition: background .15s;
}

/* The grip mark is turned sideways for vertical gutters. */
.gutter-vertical::after {
  width: 30px !important;
  height: 2px !important;
}

.gutter:hover::after {
  background: var(--bright);
}

/* Resizable panes clip their own overflow. */
.split {
  overflow: hidden;
}
/* Metrics panel: tab strip on top, scrollable content below. */
.metrics {
  display: flex;
  flex-direction: column;
  overflow: hidden;
}

/* Tab strip; scrolls sideways when the tabs do not fit. */
.tabs {
  display: flex;
  background: var(--bg2);
  border-bottom: 1px solid var(--border);
  flex-shrink: 0;
  overflow-x: auto;
}

/* One tab. The transparent bottom border reserves room for the active underline
   so the label does not shift when a tab becomes active. */
.tab {
  padding: 5px 9px;
  font-size: 11px;
  color: var(--dim);
  cursor: pointer;
  border-bottom: 2px solid transparent;
  font-weight: 500;
  white-space: nowrap;
}

.tab:hover {
  color: var(--text);
}

.tab.active {
  color: var(--bright);
  border-bottom-color: var(--accent);
}

/* Scroll container for the tab contents. */
.mscroll {
  flex: 1;
  overflow-y: auto;
  padding: 10px;
}

/* Tab content panes are hidden unless they carry .active. */
.tc {
  display: none;
}

.tc.active {
  display: block;
}
/* ---- Metric building blocks ---- */

/* A titled section of metrics. */
.msec {
  margin-bottom: 14px;
}

.msec h4 {
  font-size: 10px;
  color: var(--dim);
  text-transform: uppercase;
  letter-spacing: .5px;
  margin-bottom: 5px;
  padding-bottom: 3px;
  border-bottom: 1px solid var(--border);
}

/* Name/value row: name on the left, value on the right. */
.mrow {
  display: flex;
  justify-content: space-between;
  padding: 2px 0;
  font-size: 12px;
}

/* Metric name; truncated with an ellipsis past 60% of the row. */
.mrow .mn {
  color: var(--dim);
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
  max-width: 60%;
}

/* Metric value; tabular digits keep columns of numbers aligned. */
.mrow .mv {
  color: var(--accent);
  font-weight: 600;
  font-variant-numeric: tabular-nums;
}

/* Highlighted row variant. */
.mrow.hl .mn {
  color: var(--text);
}

.mrow.hl .mv {
  color: var(--bright);
  font-size: 13px;
}

/* Card grid: two columns by default, with three- and four-column modifiers. */
.cards {
  display: grid;
  grid-template-columns: 1fr 1fr;
  gap: 6px;
  margin-bottom: 10px;
}

.cards-3 {
  grid-template-columns: 1fr 1fr 1fr;
}

.cards-4 {
  grid-template-columns: 1fr 1fr 1fr 1fr;
}

/* Single card: large value (.cv) over a small caption (.cl). */
.card {
  background: var(--bg);
  border: 1px solid var(--border);
  border-radius: 6px;
  padding: 8px;
  text-align: center;
}

.card .cv {
  font-size: 20px;
  font-weight: 700;
  color: var(--accent);
}

.card .cl {
  font-size: 9px;
  color: var(--dim);
  text-transform: uppercase;
  margin-top: 2px;
}

/* Horizontal bar row: fixed-width label, stretching track, fixed-width value. */
.bar-row {
  display: flex;
  align-items: center;
  gap: 5px;
  margin-bottom: 3px;
  font-size: 11px;
}

.bar-label {
  width: 80px;
  text-align: right;
  color: var(--dim);
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
  flex-shrink: 0;
}

.bar-track {
  flex: 1;
  height: 14px;
  background: var(--bg);
  border-radius: 3px;
  overflow: hidden;
}

/* Filled part of the track; width changes are animated. */
.bar-fill {
  height: 100%;
  border-radius: 3px;
  transition: width .2s;
}

.bar-val {
  width: 50px;
  text-align: right;
  color: var(--accent);
  font-weight: 600;
  flex-shrink: 0;
  font-variant-numeric: tabular-nums;
}

/* Placeholder message for a pane with nothing to show. */
.empty {
  color: var(--dim);
  font-size: 12px;
  text-align: center;
  padding: 30px 12px;
}
/* Region list entry: colour dot, name, trailing value. */
.ri {
  display: flex;
  align-items: center;
  gap: 6px;
  padding: 5px 6px;
  border-radius: 4px;
  cursor: pointer;
  margin-bottom: 2px;
}

.ri:hover {
  background: var(--bg3);
}

/* Currently selected entry. */
.ri.active {
  background: rgba(88, 166, 255, .12);
}

/* Round 8px swatch; no colour is set here. */
.ri-dot {
  width: 8px;
  height: 8px;
  border-radius: 50%;
  flex-shrink: 0;
}

.ri-name {
  flex: 1;
  font-size: 12px;
}

.ri-val {
  font-size: 11px;
  color: var(--dim);
}
/* Chart host element: full width with a little vertical breathing room.
   No height is set here. */
.chart {
  width: 100%;
  margin: 6px 0;
}
/* Inline info marker: a 13px circular outline with small italic serif text.
   Positioned relative so it can anchor child content. */
.info-icon {
  cursor: pointer;
  display: inline-flex;
  align-items: center;
  justify-content: center;
  width: 13px;
  height: 13px;
  border-radius: 50%;
  border: 1.5px solid var(--dim);
  font-size: 9px;
  font-weight: 700;
  font-style: italic;
  font-family: Georgia, serif;
  color: var(--dim);
  margin-left: 3px;
  opacity: 0.7;
  vertical-align: middle;
  line-height: 1;
  user-select: none;
  position: relative;
  flex-shrink: 0;
}

.info-icon:hover {
  opacity: 1;
  color: var(--accent);
  border-color: var(--accent);
}

/* Tooltip bubble nested inside the marker. Hidden by default; it resets the
   marker's italic/bold/serif styling and uses fixed positioning with a very
   high z-index. No coordinates are set here. */
.info-tip {
  display: none;
  position: fixed;
  background: var(--bright);
  color: #fff;
  font-size: 11px;
  font-style: normal;
  font-weight: 400;
  font-family: inherit;
  padding: 8px 10px;
  border-radius: 6px;
  max-width: 300px;
  min-width: 200px;
  z-index: 999999;
  line-height: 1.5;
  box-shadow: 0 4px 12px rgba(0, 0, 0, .2);
  pointer-events: auto;
  text-align: left;
}

/* The bubble is revealed while its marker carries .show. */
.info-icon.show .info-tip {
  display: block;
}
/* Pill-shaped status badge; the colour comes from one of the modifiers below. */
.badge {
  display: inline-block;
  padding: 2px 8px;
  border-radius: 10px;
  font-size: 10px;
  font-weight: 600;
  text-transform: uppercase;
  letter-spacing: .3px;
}

/* Badge colour variants: pale background with matching darker text. */
.badge-compute {
  background: #dafbe1;
  color: #1a7f37;
}

.badge-memory {
  background: #ddf4ff;
  color: #0969da;
}

.badge-branch {
  background: #fff8c5;
  color: #9a6700;
}

.badge-balanced {
  background: #eaeef2;
  color: #656d76;
}

.badge-warn {
  background: #fff1e5;
  color: #bc4c00;
}

.badge-ok {
  background: #dafbe1;
  color: #1a7f37;
}

.badge-miss {
  background: #ffebe9;
  color: #cf222e;
}

.badge-present {
  background: #dafbe1;
  color: #1a7f37;
}
/* Collapsible section: clickable header with an arrow, and a body whose
   max-height transition animates the fold. */
.sec-hdr {
  cursor: pointer;
  user-select: none;
  display: flex;
  align-items: center;
  gap: 4px;
}

.sec-hdr:hover h4 {
  color: var(--accent);
}

.sec-hdr .arrow {
  font-size: 8px;
  transition: transform .15s;
}

/* Arrow rotates a quarter turn while the section is collapsed. */
.sec-hdr.collapsed .arrow {
  transform: rotate(-90deg);
}

.sec-body {
  overflow: hidden;
  transition: max-height .2s;
}

/* !important lets the collapsed state win over any max-height set elsewhere. */
.sec-body.collapsed {
  max-height: 0 !important;
  padding: 0;
  margin: 0;
}
/* Compact, right-aligned numeric table. */
.ptable {
  width: 100%;
  font-size: 10px;
  border-collapse: collapse;
  margin: 6px 0;
}

.ptable th,
.ptable td {
  padding: 3px 5px;
  text-align: right;
  border-bottom: 1px solid var(--border);
}

/* Header cells: dimmed and semi-bold (the alignment repeats the rule above). */
.ptable th {
  color: var(--dim);
  font-weight: 600;
  text-align: right;
}

/* Tabular digits so values line up column-wise. */
.ptable td {
  font-variant-numeric: tabular-nums;
}

/* Pale yellow highlight for flagged entries. */
.ptable .warn {
  background: #fff8c5;
}
/* Progress-style bar with a label centred over the fill. */
.eff-bar {
  height: 20px;
  border-radius: 4px;
  position: relative;
  overflow: hidden;
  margin: 4px 0;
}

/* Filled portion; width changes are animated. */
.eff-fill {
  height: 100%;
  border-radius: 4px;
  transition: width .3s;
}

/* Label overlay stretched across the whole bar and centred both ways. */
.eff-label {
  position: absolute;
  inset: 0;
  display: flex;
  align-items: center;
  justify-content: center;
  font-size: 11px;
  font-weight: 600;
  color: var(--bright);
}
/* Editor line decorations. Every declaration is !important, presumably to win
   over the editor's own stylesheet - confirm. */

/* Heat tints: red, amber, blue at decreasing opacity. */
.line-hot {
  background: rgba(207, 34, 46, .10) !important;
}

.line-warm {
  background: rgba(154, 103, 0, .08) !important;
}

.line-cold {
  background: rgba(9, 105, 218, .06) !important;
}

/* Selected line. */
.line-selected {
  background: rgba(9, 105, 218, .14) !important;
}

/* One-shot flash that fades from accent blue to transparent over 0.8s. */
.line-flash {
  animation: region-flash .8s ease-out !important;
}

@keyframes region-flash {
  0% {
    background: rgba(9, 105, 218, .30);
  }
  100% {
    background: transparent;
  }
}

/* Highlight for assembly lines. */
.asm-highlight {
  background: rgba(88, 166, 255, .15) !important;
}

/* Thin coloured bar: shared geometry plus eight colour slots (0-7). */
.region-bar {
  width: 3px !important;
  margin-left: 2px !important;
}

.region-bar-0 {
  background: #656d76 !important;
}

.region-bar-1 {
  background: #1a7f37 !important;
}

.region-bar-2 {
  background: #0969da !important;
}

.region-bar-3 {
  background: #8250df !important;
}

.region-bar-4 {
  background: #bc4c00 !important;
}

.region-bar-5 {
  background: #9a6700 !important;
}

.region-bar-6 {
  background: #cf222e !important;
}

.region-bar-7 {
  background: #6639ba !important;
}
/* Tiny centred label for the editor's glyph margin; the 20px line-height
   presumably matches the editor's line height - confirm. */
.region-glyph {
  font-size: 8px !important;
  line-height: 20px !important;
  text-align: center;
  display: flex !important;
  align-items: center;
  justify-content: center;
  opacity: 0.7;
  font-weight: 600;
  letter-spacing: -0.3px;
  color: var(--dim) !important;
}
/* Legend strip under the source editor; wraps onto extra rows when crowded. */
.src-legend {
  display: flex;
  gap: 10px;
  padding: 2px 8px;
  font-size: 9px;
  color: var(--dim);
  border-top: 1px solid var(--border);
  flex-wrap: wrap;
  align-items: center;
  flex-shrink: 0;
}

/* One clickable legend entry. */
.src-legend .leg-item {
  display: flex;
  align-items: center;
  gap: 3px;
  cursor: pointer;
}

.src-legend .leg-item:hover {
  color: var(--accent);
}

/* Round swatch. */
.src-legend .leg-dot {
  width: 8px;
  height: 8px;
  border-radius: 50%;
  flex-shrink: 0;
}

/* Wide rounded swatch. */
.src-legend .leg-heat {
  display: inline-block;
  width: 18px;
  height: 8px;
  border-radius: 2px;
}
/* Small outlined button used for the header's "Load Data..." control. */
.load-btn {
  background: var(--bg3);
  border: 1px solid var(--border);
  border-radius: 4px;
  padding: 2px 8px;
  font-size: 10px;
  cursor: pointer;
  color: var(--dim);
  white-space: nowrap;
}

/* Hover: text and outline switch to the accent colour. */
.load-btn:hover {
  color: var(--accent);
  border-color: var(--accent);
}
/* Scrollbar */
/* WebKit/Blink: slim 8px bars drawn in the theme colours. */
::-webkit-scrollbar{width:8px;height:8px;}
::-webkit-scrollbar-track{background:var(--bg);}
::-webkit-scrollbar-thumb{background:var(--border);border-radius:4px;}
/* Engines without ::-webkit-scrollbar (e.g. Firefox) ignore the three rules
   above, so give them the closest standard equivalent. The @supports guard
   matters: where both mechanisms exist, non-auto scrollbar-width or
   scrollbar-color take precedence over ::-webkit-scrollbar styling, which
   would change the look in WebKit/Blink. */
@supports not selector(::-webkit-scrollbar){
*{scrollbar-width:thin;scrollbar-color:var(--border) var(--bg);}
}
</style>
</head>
<body>
<!-- Header bar: title, file name (#fileName), summary strip (#headerInfo) and
     the dataset loader. #fileName and #headerInfo start empty. The visible
     button sits next to a hidden file input that accepts .html and .json. -->
<div class="hdr">
  <h1>IKP Explorer</h1>
  <span class="sep">/</span>
  <span class="fname" id="fileName"></span>
  <div class="info" id="headerInfo"></div>
  <button
    class="load-btn"
    id="loadBtn"
    title="Load a different profiler dataset">Load Data...</button>
  <input
    type="file"
    id="loadFile"
    accept=".html,.json"
    style="display:none">
</div>
<div class="main" id="main">
<!-- Source panel: caption, editor host (#srcWrap, empty in the markup) and the
     legend strip (#srcLegend, also empty in the markup). -->
<div class="panel" id="srcPanel">
  <div class="panel-hdr"><span>CUDA Source</span></div>
  <div class="editor-wrap" id="srcWrap"></div>
  <div class="src-legend" id="srcLegend"></div>
</div>
<!-- Assembly panel: two stacked panes, PTX (#ptxPane) above SASS (#sassPane).
     Each pane has a caption with a status span and an empty editor host. -->
<div class="panel" id="asmPanel">
  <div
    id="ptxPane"
    style="display:flex;flex-direction:column;overflow:hidden">
    <div class="panel-hdr" style="display:flex;align-items:center;gap:6px">
      <span>PTX</span>
      <span class="asm-status" id="ptxStatus" style="flex:1"></span>
    </div>
    <div class="editor-wrap" id="ptxWrap"></div>
  </div>
  <div
    id="sassPane"
    style="display:flex;flex-direction:column;overflow:hidden">
    <div class="panel-hdr" style="display:flex;align-items:center;gap:6px">
      <span>SASS</span>
      <span class="asm-status" id="sassStatus" style="flex:1"></span>
    </div>
    <div class="editor-wrap" id="sassWrap"></div>
  </div>
</div>
<!-- Metrics panel: tab strip (#tabBar) over a scroll area holding one content
     pane per tab. A tab's data-t value X pairs with the pane whose id is tc-X;
     "ov" is the tab and pane marked active in the markup. -->
<div class="panel metrics" id="metPanel">
  <div class="tabs" id="tabBar">
    <span class="tab active" data-t="ov">Overview</span>
    <span class="tab" data-t="line">Line</span>
    <span class="tab" data-t="region">Regions</span>
    <span class="tab" data-t="exec">Execution</span>
    <span class="tab" data-t="mem">Memory</span>
    <span class="tab" data-t="stalls">Stalls</span>
    <span class="tab" data-t="sys">System</span>
    <span class="tab" data-t="trace">Trace</span>
  </div>
  <div class="mscroll">
    <div class="tc active" id="tc-ov"><div id="ovCt"></div></div>
    <div class="tc" id="tc-line"><div id="lineMet"><div class="empty">Click a source line to see metrics</div></div></div>
    <div class="tc" id="tc-region"><div id="regionList"></div><div id="regionDet"></div></div>
    <div class="tc" id="tc-exec"><div id="execCt"></div></div>
    <div class="tc" id="tc-mem"><div id="memCt"></div></div>
    <div class="tc" id="tc-stalls"><div id="stallsCt"></div></div>
    <div class="tc" id="tc-sys"><div id="sysCt"></div></div>
    <div class="tc" id="tc-trace"><div id="traceCt"></div></div>
  </div>
</div>
</div>
<!-- Full-viewport overlay shown in the initial markup, with its message centred
     on the page background. -->
<div
  id="loading"
  style="position:fixed;inset:0;background:var(--bg);display:flex;align-items:center;
justify-content:center;z-index:999;color:var(--dim);font-size:14px;">
  Loading Monaco Editor from CDN...
</div>
<!-- CDN -->
<script src="https://cdn.jsdelivr.net/npm/split.js@1.6.5/dist/split.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/echarts@5.5.1/dist/echarts.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/monaco-editor@0.52.0/min/vs/loader.min.js"></script>
<script>
// ── Data ──
let D = {"source":{"code":"// nsys_demo.cu \u2014 Minimal tiled GEMM for NSys + IKP trace demo.\n//\n// Single kernel launch, no warmup, no benchmark \u2014 produces a clean\n// trace where every NSys event maps directly to the IKP-profiled kernel.\n//\n// Build:\n// nvcc -O3 -std=c++17 -arch=sm_90a -lineinfo -I ../../include \\\n// nsys_demo.cu -o nsys_demo\n//\n// Run (under nsys):\n// nsys profile --trace=cuda ./nsys_demo --out=trace.json\n\n#include <cuda_runtime.h>\n#include <cstdio>\n#include <cstdlib>\n#include <cstring>\n#include <string>\n#include <vector>\n\n#include <intra_kernel_profiler/intra_kernel_profiler.hpp>\n\nconstexpr int BM = 64, BN = 64, BK = 16;\nconstexpr int TM = 4, TN = 4;\nconstexpr int THREADS = (BM / TM) * (BN / TN);\nconstexpr uint32_t kWarpsPerBlock = (THREADS + 31) / 32;\nconstexpr uint32_t PROFILE_CAP = 4096;\n\nenum Region : uint16_t {\n kTotal = 1,\n kLoadA = 2,\n kLoadB = 3,\n kCompute = 4,\n kStore = 5,\n};\n\n__global__ void gemm_kernel(\n const float* __restrict__ A, const float* __restrict__ B,\n float* __restrict__ C, int M, int N, int K, int tiles_n,\n intra_kernel_profiler::trace::GlobalBuffer prof)\n{\n IKP_TRACE_CTX_TYPE(PROFILE_CAP, kWarpsPerBlock) ctx;\n IKP_TRACE_CTX_INIT(ctx);\n\n const int bx = blockIdx.x % tiles_n, by = blockIdx.x / tiles_n;\n const int row0 = by * BM, col0 = bx * BN;\n const int tx = threadIdx.x % (BN / TN), ty = threadIdx.x / (BN / TN);\n\n __shared__ float sA[BM][BK], sB[BK][BN];\n float acc[TM][TN] = {};\n\n IKP_TRACE_REC_B(ctx, prof, kTotal);\n\n for (int t = 0; t < (K + BK - 1) / BK; ++t) {\n IKP_TRACE_REC_B(ctx, prof, kLoadA);\n for (int i = threadIdx.x; i < BM * BK; i += THREADS) {\n int r = i / BK, c = i % BK;\n int gr = row0 + r, gc = t * BK + c;\n sA[r][c] = (gr < M && gc < K) ? 
A[gr * K + gc] : 0.f;\n }\n IKP_TRACE_REC_E(ctx, prof, kLoadA);\n\n IKP_TRACE_REC_B(ctx, prof, kLoadB);\n for (int i = threadIdx.x; i < BK * BN; i += THREADS) {\n int r = i / BN, c = i % BN;\n int gr = t * BK + r, gc = col0 + c;\n sB[r][c] = (gr < K && gc < N) ? B[gr * N + gc] : 0.f;\n }\n IKP_TRACE_REC_E(ctx, prof, kLoadB);\n\n __syncthreads();\n\n IKP_TRACE_REC_B(ctx, prof, kCompute);\n for (int k = 0; k < BK; ++k)\n for (int i = 0; i < TM; ++i)\n for (int j = 0; j < TN; ++j)\n acc[i][j] += sA[ty * TM + i][k] * sB[k][tx * TN + j];\n IKP_TRACE_REC_E(ctx, prof, kCompute);\n __syncthreads();\n }\n\n IKP_TRACE_REC_B(ctx, prof, kStore);\n for (int i = 0; i < TM; ++i)\n for (int j = 0; j < TN; ++j) {\n int gr = row0 + ty * TM + i, gc = col0 + tx * TN + j;\n if (gr < M && gc < N) C[gr * N + gc] = acc[i][j];\n }\n IKP_TRACE_REC_E(ctx, prof, kStore);\n\n IKP_TRACE_REC_E(ctx, prof, kTotal);\n IKP_TRACE_CTX_FLUSH(ctx, prof);\n}\n\nstatic int get_int(int argc, char** argv, const char* key, int def) {\n std::string pfx = std::string(\"--\") + key + \"=\";\n for (int i = 1; i < argc; ++i)\n if (std::strncmp(argv[i], pfx.c_str(), pfx.size()) == 0)\n return std::atoi(argv[i] + pfx.size());\n return def;\n}\nstatic const char* get_str(int argc, char** argv, const char* key, const char* def) {\n std::string pfx = std::string(\"--\") + key + \"=\";\n for (int i = 1; i < argc; ++i)\n if (std::strncmp(argv[i], pfx.c_str(), pfx.size()) == 0)\n return argv[i] + pfx.size();\n return def;\n}\n\nint main(int argc, char** argv) {\n const int M = get_int(argc, argv, \"m\", 2048);\n const int N = get_int(argc, argv, \"n\", 2048);\n const int K = get_int(argc, argv, \"k\", 2048);\n const char* out = get_str(argc, argv, \"out\", \"trace.json\");\n\n const int tiles_m = (M + BM - 1) / BM;\n const int tiles_n = (N + BN - 1) / BN;\n const int nblocks = tiles_m * tiles_n;\n\n // Allocate & init\n size_t szA = (size_t)M * K, szB = (size_t)K * N, szC = (size_t)M * N;\n std::vector<float> hA(szA), 
hB(szB);\n for (size_t i = 0; i < szA; ++i) hA[i] = float(i % 101) / 101.f;\n for (size_t i = 0; i < szB; ++i) hB[i] = float(i % 103) / 103.f;\n\n float *dA, *dB, *dC;\n cudaMalloc(&dA, szA * sizeof(float));\n cudaMalloc(&dB, szB * sizeof(float));\n cudaMalloc(&dC, szC * sizeof(float));\n cudaMemcpy(dA, hA.data(), szA * sizeof(float), cudaMemcpyHostToDevice);\n cudaMemcpy(dB, hB.data(), szB * sizeof(float), cudaMemcpyHostToDevice);\n cudaMemset(dC, 0, szC * sizeof(float));\n\n // IKP profiler\n intra_kernel_profiler::trace::HostSession sess;\n sess.set_region_names({\"_outside\", \"total\", \"load_A\", \"load_B\", \"compute\", \"store\"});\n sess.set_block_filter({0u, 1u, 2u, 3u});\n sess.init(PROFILE_CAP, nblocks, THREADS);\n\n // Single profiled kernel launch \u2014 this is what NSys and IKP both capture.\n gemm_kernel<<<nblocks, THREADS>>>(dA, dB, dC, M, N, K, tiles_n, sess.global_buffer());\n cudaDeviceSynchronize();\n\n // Write IKP trace\n intra_kernel_profiler::trace::TraceWriteOptions opt;\n opt.emit_summary_json = true;\n sess.write_trace(out, opt);\n\n cudaFree(dA); cudaFree(dB); cudaFree(dC);\n printf(\"Done: %s\\n\", out);\n return 
0;\n}\n","path":"nsys_demo.cu"},"sass":{"text":"","lineMap":{}},"ptx":{"text":"","lineMap":{}},"perLine":{},"regions":{},"labels":{"1":"total","2":"load_A","3":"load_B","4":"compute","5":"store"},"colors":{"0":"#656d76","1":"#1a7f37","2":"#0969da","3":"#8250df","4":"#bc4c00","5":"#9a6700","6":"#cf222e","7":"#6639ba"},"trace":[{"name":"total","count":32,"mean":313426.0,"cv":0.000361,"min":313184.0,"max":313632.0,"percentiles":{"p5":313279.2,"p10":313281.066667,"p15":313310.933333,"p20":313312.8,"p25":313342.375,"p30":313343.775,"p35":313373.35,"p40":313376.15,"p45":313409.225,"p50":313410.625,"p55":313440.55,"p60":313502.6,"p65":313503.4,"p70":313504.2,"p75":313505.0,"p80":313505.8,"p85":313536.1,"p90":313568.3,"p95":313629.2,"p99":313631.44},"var_dur_pop":12412.0,"var_dur_sample":12812.387097,"hist":[0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625],"hist_min":313184.0,"hist_max":313632.0,"hist_bins":128},{"name":"load_A","count":4096,"mean":518.390625,"cv":0.341514,"min":320.0,"max":2112.0,"percentiles":{"p5":376.195246,"p10":380.739144,"p15":385.283043,"p20":389.826941,"p25":408.002903,"p30":412.164296,"p35":416.325689,"p40":450.450909,"p45":457.898182,"p50":481.297872,"p55":505.025387,"p60":513.902167,"p65":551.876259,"p70":576.17491,"p75":600.498024,"p80":611.83083,"p85":639.42069,"p90":681.432195,"p95":730.698851,"p99":926.76},"var_dur_pop":31334.659912,"var_dur_sample":31342.311844,"hist":[0.001953,0.0,0.045898,0.0,0.154053,0.0,0.168213,0.0,0.0,0.093994,0.0,0.068848,
0.0,0.078857,0.0,0.0,0.067871,0.0,0.068115,0.0,0.061768,0.0,0.049561,0.0,0.0,0.050049,0.0,0.033691,0.0,0.02124,0.0,0.0,0.01123,0.0,0.008789,0.0,0.002441,0.0,0.001465,0.0,0.0,0.001465,0.0,0.001465,0.0,0.001221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000732,0.0,0.001221,0.0,0.000488,0.0,0.000732,0.0,0.0,0.000488,0.0,0.001709,0.0,0.001221,0.0,0.001221],"hist_min":320.0,"hist_max":2112.0,"hist_bins":128},{"name":"load_B","count":4096,"mean":661.664062,"cv":0.206791,"min":352.0,"max":1344.0,"percentiles":{"p5":418.628912,"p10":452.67619,"p15":508.566316,"p20":540.397093,"p25":571.613725,"p30":608.706724,"p35":614.179828,"p40":642.292206,"p45":670.163061,"p50":674.350923,"p55":701.589446,"p60":704.973667,"p65":708.357889,"p70":735.740945,"p75":763.196686,"p80":767.770749,"p85":796.3417,"p90":826.647576,"p95":868.5125,"p99":989.736429},"var_dur_pop":18716.918396,"var_dur_sample":18721.489072,"hist":[0.004883,0.0,0.0,0.0,0.023682,0.0,0.0,0.0,0.035889,0.0,0.0,0.0,0.035889,0.0,0.0,0.0,0.040283,0.0,0.0,0.0,0.046387,0.0,0.0,0.0,0.041992,0.0,0.0,0.0,0.062256,0.0,0.0,0.0,0.0,0.070801,0.0,0.0,0.0,0.083008,0.0,0.0,0.0,0.092529,0.0,0.0,0.0,0.114502,0.0,0.0,0.0,0.093018,0.0,0.0,0.0,0.084717,0.0,0.0,0.0,0.060303,0.0,0.0,0.0,0.040283,0.0,0.0,0.0,0.0,0.030273,0.0,0.0,0.0,0.015381,0.0,0.0,0.0,0.008301,0.0,0.0,0.0,0.004639,0.0,0.0,0.0,0.003418,0.0,0.0,0.0,0.001953,0.0,0.0,0.0,0.000732,0.0,0.0,0.0,0.001953,0.0,0.0,0.0,0.0,0.001465,0.0,0.0,0.0,0.000488,0.0,0.0,0.0,0.000488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000244,0.0,0.0,0.0,0.000244],"hist_min":352.0,"hist_max":1344.0,"hist_bins":128},{"name":"compute","count":4096,"mean":1027.125,"cv":0.157658,"min":512.0,"max":1568.0,"percentiles":{"p5":798.364851,"p10":832.779899,"p15":863.455464,"p20":8
93.456761,"p25":898.216197,"p30":927.908036,"p35":957.691743,"p40":962.858716,"p45":992.939836,"p50":998.479508,"p55":1028.429755,"p60":1050.515672,"p65":1081.610504,"p70":1088.709664,"p75":1121.910714,"p80":1181.024302,"p85":1215.547059,"p90":1253.022174,"p95":1310.714356,"p99":1437.281923},"var_dur_pop":26216.234375,"var_dur_sample":26222.636386,"hist":[0.001465,0.0,0.0,0.000732,0.0,0.0,0.0,0.001221,0.0,0.0,0.0,0.001709,0.0,0.0,0.0,0.002197,0.0,0.0,0.0,0.002441,0.0,0.0,0.0,0.00293,0.0,0.0,0.0,0.006348,0.0,0.0,0.0,0.013428,0.0,0.0,0.024658,0.0,0.0,0.0,0.048584,0.0,0.0,0.0,0.07373,0.0,0.0,0.0,0.08667,0.0,0.0,0.0,0.082031,0.0,0.0,0.0,0.079834,0.0,0.0,0.0,0.074463,0.0,0.0,0.0,0.07959,0.0,0.0,0.06543,0.0,0.0,0.0,0.058105,0.0,0.0,0.0,0.047852,0.0,0.0,0.0,0.04248,0.0,0.0,0.0,0.043701,0.0,0.0,0.0,0.037354,0.0,0.0,0.0,0.028076,0.0,0.0,0.0,0.024902,0.0,0.0,0.024658,0.0,0.0,0.0,0.016113,0.0,0.0,0.0,0.010254,0.0,0.0,0.0,0.008057,0.0,0.0,0.0,0.006348,0.0,0.0,0.0,0.001709,0.0,0.0,0.0,0.001953,0.0,0.0,0.0,0.000244,0.0,0.0,0.000732],"hist_min":512.0,"hist_max":1568.0,"hist_bins":128},{"name":"store","count":32,"mean":568.0,"cv":0.138739,"min":352.0,"max":704.0,"percentiles":{"p5":416.9,"p10":478.775,"p15":480.975,"p20":512.4625,"p25":513.5625,"p30":542.1625,"p35":543.2625,"p40":544.3625,"p45":575.52,"p50":576.4,"p55":577.28,"p60":608.3,"p65":609.033333,"p70":609.766667,"p75":610.5,"p80":638.733333,"p85":639.466667,"p90":640.2,"p95":701.8,"p99":703.56},"var_dur_pop":6016.0,"var_dur_sample":6210.064516,"hist":[0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625],"hist_min":352.0,"hist_max":704.0,"hist_bins":128}],"traceMeta":{"blocks":1024,"warps_per_block":8,"trace_file":"gemm_trace.json","by_block_warp":{"region_total":{"region":1,"name":"total","by_block_warp":[{"block":0,"warp":0,"count":1,"mean_dur":313504.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313504.0,"max_dur":313504.0},{"block":0,"warp":1,"count":1,"mean_dur":313568.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313568.0,"max_dur":313568.0},{"block":0,"warp":2,"count":1,"mean_dur":313632.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313632.0,"max_dur":313632.0},{"block":0,"warp":3,"count":1,"mean_dur":313504.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313504.0,"max_dur":313504.0},{"block":0,"warp":4,"count":1,"mean_dur":313504.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313504.0,"max_dur":313504.0},{"block":0,"warp":5,"count":1,"mean_dur":313504.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313504.0,"max_dur":313504.0},{"block":0,"warp":6,"count":1,"mean_dur":313504.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313504.0,"max_dur":313504.0},{"block":0,"warp":7,"count":1,"mean_dur":313600.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313600.0,"max_dur":313600.0},{"block":1,"warp":0,"count":1,"mean_dur":313504.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313504.0,"max_dur":313504.0},{"block":1,"warp":1,"count":1,"mean_dur":313536.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313536.0,"max_dur":313536.0},{"block":1,"warp":2,"count":1,"mean_dur":313440.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313440.0,"max_dur":313440.0},{"block":1,"warp":3,"count":1,"mean_dur":313344.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313344.0,"max_dur":313344.0},{"block":1,"warp":4,"count":1,"mean_d
ur":313312.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313312.0,"max_dur":313312.0},{"block":1,"warp":5,"count":1,"mean_dur":313376.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313376.0,"max_dur":313376.0},{"block":1,"warp":6,"count":1,"mean_dur":313632.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313632.0,"max_dur":313632.0},{"block":1,"warp":7,"count":1,"mean_dur":313312.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313312.0,"max_dur":313312.0},{"block":2,"warp":0,"count":1,"mean_dur":313344.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313344.0,"max_dur":313344.0},{"block":2,"warp":1,"count":1,"mean_dur":313408.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313408.0,"max_dur":313408.0},{"block":2,"warp":2,"count":1,"mean_dur":313408.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313408.0,"max_dur":313408.0},{"block":2,"warp":3,"count":1,"mean_dur":313408.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313408.0,"max_dur":313408.0},{"block":2,"warp":4,"count":1,"mean_dur":313344.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313344.0,"max_dur":313344.0},{"block":2,"warp":5,"count":1,"mean_dur":313344.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313344.0,"max_dur":313344.0},{"block":2,"warp":6,"count":1,"mean_dur":313280.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313280.0,"max_dur":313280.0},{"block":2,"warp":7,"count":1,"mean_dur":313280.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313280.0,"max_dur":313280.0},{"block":3,"warp":0,"count":1,"mean_dur":313312.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313312.0,"max_dur":313312.0},{"block":3,"warp":1,"count":1,"mean_dur":313440.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313440.0,"max_dur":313440.0},{"block":3,"warp":2,"count":1,"mean_dur":313408.0,"cv_dur":0.0,
"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313408.0,"max_dur":313408.0},{"block":3,"warp":3,"count":1,"mean_dur":313536.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313536.0,"max_dur":313536.0},{"block":3,"warp":4,"count":1,"mean_dur":313280.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313280.0,"max_dur":313280.0},{"block":3,"warp":5,"count":1,"mean_dur":313184.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313184.0,"max_dur":313184.0},{"block":3,"warp":6,"count":1,"mean_dur":313376.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313376.0,"max_dur":313376.0},{"block":3,"warp":7,"count":1,"mean_dur":313504.0,"cv_dur":0.0,"var_dur_pop":0.0,"var_dur_sample":0.0,"min_dur":313504.0,"max_dur":313504.0}]},"region_load_A":{"region":2,"name":"load_A","by_block_warp":[{"block":0,"warp":0,"count":128,"mean_dur":513.75,"cv_dur":0.345724,"var_dur_pop":31300.9375,"var_dur_sample":31547.401575,"min_dur":352.0,"max_dur":1984.0},{"block":0,"warp":1,"count":128,"mean_dur":513.75,"cv_dur":0.338942,"var_dur_pop":30084.9375,"var_dur_sample":30321.826772,"min_dur":352.0,"max_dur":2048.0},{"block":0,"warp":2,"count":128,"mean_dur":512.75,"cv_dur":0.339617,"var_dur_pop":30087.4375,"var_dur_sample":30324.346457,"min_dur":352.0,"max_dur":1984.0},{"block":0,"warp":3,"count":128,"mean_dur":512.0,"cv_dur":0.332481,"var_dur_pop":28752.0,"var_dur_sample":28978.393701,"min_dur":352.0,"max_dur":1920.0},{"block":0,"warp":4,"count":128,"mean_dur":511.0,"cv_dur":0.324105,"var_dur_pop":27215.0,"var_dur_sample":27429.291339,"min_dur":320.0,"max_dur":1888.0},{"block":0,"warp":5,"count":128,"mean_dur":507.0,"cv_dur":0.319429,"var_dur_pop":26023.0,"var_dur_sample":26227.905512,"min_dur":320.0,"max_dur":1888.0},{"block":0,"warp":6,"count":128,"mean_dur":516.0,"cv_dur":0.335637,"var_dur_pop":29760.0,"var_dur_sample":29994.330709,"min_dur":320.0,"max_dur":1952.0},{"block":0,"warp":7,"count":128,"mean_dur":511.0,"cv_dur":0.321042,"var
_dur_pop":26703.0,"var_dur_sample":26913.259843,"min_dur":352.0,"max_dur":1920.0},{"block":1,"warp":0,"count":128,"mean_dur":515.25,"cv_dur":0.337014,"var_dur_pop":29917.4375,"var_dur_sample":30153.007874,"min_dur":352.0,"max_dur":1984.0},{"block":1,"warp":1,"count":128,"mean_dur":517.75,"cv_dur":0.345249,"var_dur_pop":31702.9375,"var_dur_sample":31952.566929,"min_dur":352.0,"max_dur":2016.0},{"block":1,"warp":2,"count":128,"mean_dur":514.25,"cv_dur":0.348978,"var_dur_pop":31954.9375,"var_dur_sample":32206.551181,"min_dur":352.0,"max_dur":2016.0},{"block":1,"warp":3,"count":128,"mean_dur":517.5,"cv_dur":0.340117,"var_dur_pop":30737.75,"var_dur_sample":30979.779528,"min_dur":352.0,"max_dur":1920.0},{"block":1,"warp":4,"count":128,"mean_dur":513.0,"cv_dur":0.328487,"var_dur_pop":28175.0,"var_dur_sample":28396.850394,"min_dur":352.0,"max_dur":1888.0},{"block":1,"warp":5,"count":128,"mean_dur":515.75,"cv_dur":0.335314,"var_dur_pop":29673.9375,"var_dur_sample":29907.590551,"min_dur":352.0,"max_dur":1920.0},{"block":1,"warp":6,"count":128,"mean_dur":512.5,"cv_dur":0.333907,"var_dur_pop":29055.75,"var_dur_sample":29284.535433,"min_dur":352.0,"max_dur":1920.0},{"block":1,"warp":7,"count":128,"mean_dur":516.75,"cv_dur":0.334256,"var_dur_pop":29601.4375,"var_dur_sample":29834.519685,"min_dur":320.0,"max_dur":1952.0},{"block":2,"warp":0,"count":128,"mean_dur":527.75,"cv_dur":0.344686,"var_dur_pop":32831.9375,"var_dur_sample":33090.456693,"min_dur":352.0,"max_dur":2080.0},{"block":2,"warp":1,"count":128,"mean_dur":525.5,"cv_dur":0.336288,"var_dur_pop":30985.75,"var_dur_sample":31229.732283,"min_dur":352.0,"max_dur":2048.0},{"block":2,"warp":2,"count":128,"mean_dur":521.75,"cv_dur":0.340706,"var_dur_pop":31352.9375,"var_dur_sample":31599.811024,"min_dur":352.0,"max_dur":2048.0},{"block":2,"warp":3,"count":128,"mean_dur":520.25,"cv_dur":0.344353,"var_dur_pop":31843.9375,"var_dur_sample":32094.677165,"min_dur":352.0,"max_dur":2048.0},{"block":2,"warp":4,"count":128,"mean_dur":522.
25,"cv_dur":0.342231,"var_dur_pop":31694.9375,"var_dur_sample":31944.503937,"min_dur":352.0,"max_dur":2080.0},{"block":2,"warp":5,"count":128,"mean_dur":521.75,"cv_dur":0.346738,"var_dur_pop":32472.9375,"var_dur_sample":32728.629921,"min_dur":352.0,"max_dur":2112.0},{"block":2,"warp":6,"count":128,"mean_dur":525.25,"cv_dur":0.350314,"var_dur_pop":33592.4375,"var_dur_sample":33856.944882,"min_dur":352.0,"max_dur":2112.0},{"block":2,"warp":7,"count":128,"mean_dur":520.75,"cv_dur":0.353946,"var_dur_pop":33707.4375,"var_dur_sample":33972.850394,"min_dur":352.0,"max_dur":2112.0},{"block":3,"warp":0,"count":128,"mean_dur":524.25,"cv_dur":0.355681,"var_dur_pop":34497.9375,"var_dur_sample":34769.574803,"min_dur":352.0,"max_dur":2080.0},{"block":3,"warp":1,"count":128,"mean_dur":527.25,"cv_dur":0.353235,"var_dur_pop":34415.4375,"var_dur_sample":34686.425197,"min_dur":352.0,"max_dur":2080.0},{"block":3,"warp":2,"count":128,"mean_dur":519.25,"cv_dur":0.355431,"var_dur_pop":33795.4375,"var_dur_sample":34061.543307,"min_dur":320.0,"max_dur":2048.0},{"block":3,"warp":3,"count":128,"mean_dur":527.0,"cv_dur":0.348686,"var_dur_pop":33503.0,"var_dur_sample":33766.80315,"min_dur":320.0,"max_dur":2048.0},{"block":3,"warp":4,"count":128,"mean_dur":517.25,"cv_dur":0.352092,"var_dur_pop":32908.4375,"var_dur_sample":33167.559055,"min_dur":352.0,"max_dur":2048.0},{"block":3,"warp":5,"count":128,"mean_dur":527.25,"cv_dur":0.357803,"var_dur_pop":35311.4375,"var_dur_sample":35589.480315,"min_dur":352.0,"max_dur":2112.0},{"block":3,"warp":6,"count":128,"mean_dur":517.75,"cv_dur":0.355296,"var_dur_pop":33574.9375,"var_dur_sample":33839.307087,"min_dur":320.0,"max_dur":2080.0},{"block":3,"warp":7,"count":128,"mean_dur":523.25,"cv_dur":0.356317,"var_dur_pop":34489.4375,"var_dur_sample":34761.007874,"min_dur":352.0,"max_dur":2112.0}]}}},"hotspots":null,"pcsamp":null,"instrexec":null,"locality":null,"sassProfiles":{},"cuptiPerRegion":{},"cuptiCoverage":{},"instrexecPerRegion":{},"pcsampPerRegion":{}
,"crossValidation":{},"profiles":[],"defs":{"inst_total":{"short":"Total Instructions","long":"Total warp-level instructions executed in this region (NVBit count). Each count = one warp executing one SASS instruction.","unit":"instr"},"inst_pred_off":{"short":"Predicated Off","long":"Instructions where all active threads had predication OFF (not executed). High values indicate compiler-generated predicated code with poor utilization.","unit":"instr"},"bb_exec":{"short":"BB Executions","long":"Total basic-block execution count across all warps. Indicates dynamic control flow activity.","unit":"count"},"gmem_load":{"short":"Global Loads","long":"Number of global memory load operations (warp-level). Each operation may touch multiple cache lines.","unit":"ops"},"gmem_store":{"short":"Global Stores","long":"Number of global memory store operations (warp-level).","unit":"ops"},"gmem_bytes":{"short":"Global Bytes","long":"Total bytes actually transferred for global memory operations, including over-fetch due to misalignment or non-coalesced access.","unit":"bytes"},"gmem_req_bytes":{"short":"Global Requested Bytes","long":"Bytes actually needed by the program (requested). Compare with gmem_bytes to find wasted bandwidth from poor coalescing.","unit":"bytes"},"gmem_inst_load_count":{"short":"Global Load Insts","long":"Count of unique global load instructions (static). Different from gmem_load which counts dynamic executions.","unit":"count"},"gmem_inst_store_count":{"short":"Global Store Insts","long":"Count of unique global store instructions (static).","unit":"count"},"gmem_sectors_32b":{"short":"Global Sectors","long":"32-byte sectors accessed in global memory. Ideal coalescing: 4 sectors per 128B cache line. Higher sector count relative to ops = poor coalescing.","unit":"sectors"},"gmem_unique_lines_est":{"short":"Unique Cache Lines","long":"Estimated unique 128B cache lines touched by global memory operations. 
Indicates memory footprint.","unit":"lines"},"gmem_sectors_per_inst_hist":{"short":"Sectors/Inst Distribution","long":"Distribution of sectors accessed per global memory instruction (0-32). Bin 4 = ideal 128B coalesced access. Higher bins = worse coalescing.","unit":"histogram"},"gmem_alignment_hist":{"short":"Alignment Distribution","long":"Distribution of access alignment offsets (8 bins). Bin 0 = aligned. Non-zero bins indicate misaligned accesses causing extra sector fetches.","unit":"histogram"},"gmem_stride_class_hist":{"short":"Stride Classification","long":"Access pattern classification: [0]=Sequential (best), [1]=Strided (moderate), [2]=Random (worst for coalescing).","unit":"histogram"},"smem_load":{"short":"Shared Loads","long":"Number of shared memory load operations (warp-level).","unit":"ops"},"smem_store":{"short":"Shared Stores","long":"Number of shared memory store operations (warp-level).","unit":"ops"},"smem_bytes":{"short":"Shared Bytes","long":"Total bytes transferred for shared memory operations.","unit":"bytes"},"smem_req_bytes":{"short":"Shared Requested Bytes","long":"Bytes actually needed for shared memory operations.","unit":"bytes"},"smem_inst_load_count":{"short":"Shared Load Insts","long":"Count of unique shared memory load instructions (static).","unit":"count"},"smem_inst_store_count":{"short":"Shared Store Insts","long":"Count of unique shared memory store instructions (static).","unit":"count"},"smem_bank_conflict_max_hist":{"short":"Bank Conflict Distribution","long":"Distribution of max bank conflict ways (0-32) across shared memory ops. Bin 0 = no conflict, bin 1 = no conflict (1-way). 
Bins >1 indicate N-way bank conflicts causing serialization.","unit":"histogram"},"smem_broadcast_count":{"short":"Broadcast Count","long":"Number of shared memory broadcast operations where all threads read the same address (free, no conflict).","unit":"count"},"smem_addr_span_hist":{"short":"Address Span Distribution","long":"Distribution of address span (range of addresses accessed by a warp in a single shared memory instruction). Narrow spans indicate good locality.","unit":"histogram"},"lmem_load":{"short":"Local Loads","long":"Number of local memory load operations. Local memory is backed by global memory (slow) and indicates register spills.","unit":"ops"},"lmem_store":{"short":"Local Stores","long":"Number of local memory store operations.","unit":"ops"},"lmem_bytes":{"short":"Local Bytes","long":"Total bytes transferred for local memory. High values indicate excessive register spilling.","unit":"bytes"},"lmem_req_bytes":{"short":"Local Requested Bytes","long":"Bytes actually needed for local memory operations.","unit":"bytes"},"reg_spill_suspected":{"short":"Register Spill","long":"Number of suspected register spill operations. Spills use local memory (backed by global DRAM) and significantly hurt performance.","unit":"count"},"spill_ld_local_inst":{"short":"Spill Loads","long":"Local memory load instructions attributed to register spills.","unit":"instr"},"spill_st_local_inst":{"short":"Spill Stores","long":"Local memory store instructions attributed to register spills.","unit":"instr"},"branch_div_hist":{"short":"Divergence Histogram","long":"Distribution of active lane count at divergent branches (33 bins, 0-32). Shows how many threads are active when the warp diverges. Bin 32 = no divergence.","unit":"histogram"},"branch_active_hist":{"short":"Active Lanes Histogram","long":"Distribution of active lane count across all branch executions (33 bins, 0-32). 
Bin 32 = all threads active = full SIMT utilization.","unit":"histogram"},"branch_active_avg_lanes":{"short":"Avg Active Lanes","long":"Average number of active lanes across all branch executions. 32 = perfect utilization, lower values indicate thread divergence or early exit.","unit":"lanes"},"branch_div_avg_active":{"short":"Divergent Avg Active","long":"Average active lanes specifically at divergent branches. Lower than branch_active_avg_lanes indicates divergence concentrates in low-activity warps.","unit":"lanes"},"branch_div_entropy":{"short":"Divergence Entropy","long":"Shannon entropy of the divergence histogram. 0 = perfectly uniform execution (all divergent branches have same active count). Higher = more varied divergence patterns.","unit":"bits"},"alu_fp32":{"short":"FP32 ALU","long":"32-bit floating-point arithmetic instructions (FADD, FMUL, FFMA, etc.).","unit":"instr"},"alu_int":{"short":"Integer ALU","long":"Integer arithmetic instructions (IADD, IMAD, etc.).","unit":"instr"},"tensor_wgmma":{"short":"Tensor/WGMMA","long":"Tensor core (HMMA/WGMMA) instructions. 
Primary compute units for matrix multiplication.","unit":"instr"},"ld_global":{"short":"Load Global","long":"Global memory load instructions.","unit":"instr"},"st_global":{"short":"Store Global","long":"Global memory store instructions.","unit":"instr"},"ld_shared":{"short":"Load Shared","long":"Shared memory load instructions.","unit":"instr"},"st_shared":{"short":"Store Shared","long":"Shared memory store instructions.","unit":"instr"},"ld_local":{"short":"Load Local","long":"Local memory load instructions (often register spills).","unit":"instr"},"st_local":{"short":"Store Local","long":"Local memory store instructions (often register spills).","unit":"instr"},"barrier":{"short":"Barrier Stall","long":"Warp stalled at a __syncthreads or named barrier.","unit":"samples"},"membar":{"short":"Memory Barrier Stall","long":"Warp stalled at a memory fence instruction.","unit":"samples"},"branch":{"short":"Branch","long":"Branch instructions (conditional and unconditional).","unit":"instr"},"call":{"short":"Call","long":"Function call instructions.","unit":"instr"},"ret":{"short":"Return","long":"Function return instructions.","unit":"instr"},"special":{"short":"Special","long":"Special-purpose instructions (S2R, MUFU, etc.).","unit":"instr"},"other":{"short":"Other","long":"Other instructions not classified above.","unit":"instr"},"inst_pipe":{"short":"Instruction Pipeline","long":"Distribution of instructions across hardware pipelines. Requires IKP_NVBIT_ENABLE_INST_PIPE=1.","unit":"instr"},"mean_dur":{"short":"Mean Duration","long":"Average duration across all invocations of this region.","unit":"ticks"},"cv_dur":{"short":"CV Duration","long":"Coefficient of variation of region timing. CV > 0.2 suggests significant jitter. 
CV > 1.0 indicates extreme variability.","unit":"ratio"},"min_dur":{"short":"Min Duration","long":"Minimum observed duration.","unit":"ticks"},"max_dur":{"short":"Max Duration","long":"Maximum observed duration.","unit":"ticks"},"var_dur_pop":{"short":"Population Variance","long":"Population variance of duration.","unit":"ticks^2"},"var_dur_sample":{"short":"Sample Variance","long":"Sample variance of duration (Bessel-corrected).","unit":"ticks^2"},"predication_rate":{"short":"Predication Rate","long":"Fraction of instructions with predication OFF. High rate = wasted issue slots.","unit":"%"},"gmem_efficiency":{"short":"Global Mem Efficiency","long":"Ratio of requested bytes to transferred bytes. 100% = perfect coalescing, <50% = severe waste.","unit":"%"},"branch_uniformity":{"short":"Branch Uniformity","long":"Fraction of branch executions with all 32 lanes active. 100% = no divergence.","unit":"%"},"compute_intensity":{"short":"Compute Intensity","long":"Ratio of compute instructions (FP32+tensor) to memory instructions. Higher = more compute-bound.","unit":"ratio"},"bottleneck":{"short":"Bottleneck","long":"Heuristic from NVBit instruction mix. compute_frac = (alu_fp32 + tensor_wgmma) / inst_total, memory_frac = (ld/st_global + ld/st_shared) / inst_total, branch_frac = branch / inst_total. If the largest fraction > 30% it is the bottleneck; otherwise 'balanced'.","unit":"class"},"heat":{"short":"Hotness","long":"Instruction hotness relative to the hottest source line. 100% = most-executed line.","unit":"%"},"smsp__sass_inst_executed":{"short":"Instructions Executed","long":"Total SASS instructions executed (CUPTI SASS metric). Counts warp-level instruction executions.","unit":"instr"},"smsp__sass_thread_inst_executed":{"short":"Thread-Insts Executed","long":"Total thread-level instruction executions. 
Divide by (inst_executed * 32) for thread utilization.","unit":"thread-instr"},"smsp__sass_thread_inst_executed_pred_on":{"short":"Thread-Insts Pred On","long":"Thread instructions where predicate was ON (actually executed).","unit":"thread-instr"},"no_instruction":{"short":"No Instruction","long":"Warp stalled because no instruction was available. Often indicates instruction cache miss or branch resolution delay.","unit":"samples"},"not_selected":{"short":"Not Selected","long":"Warp was eligible but not selected by the scheduler. Indicates scheduler contention.","unit":"samples"},"wait":{"short":"Wait","long":"Warp stalled on a fixed-latency instruction dependency (e.g., ALU result not ready).","unit":"samples"},"sleeping":{"short":"Sleeping","long":"Warp is sleeping (nanosleep instruction). Intentional delay.","unit":"samples"},"short_scoreboard":{"short":"Short Scoreboard","long":"Warp waiting for short-latency operation (shared memory, L1 cache). Common for shared memory bank conflicts.","unit":"samples"},"long_scoreboard":{"short":"Long Scoreboard","long":"Warp waiting for long-latency operation (global memory, L2 cache, texture). Dominant stall for memory-bound kernels.","unit":"samples"},"math_pipe_throttle":{"short":"Math Pipe Throttle","long":"Math execution pipeline is full. Indicates compute saturation.","unit":"samples"},"lg_throttle":{"short":"LG Throttle","long":"Local/global memory pipeline is full. 
Indicates memory bandwidth saturation.","unit":"samples"},"tex_throttle":{"short":"Texture Throttle","long":"Texture pipeline is full.","unit":"samples"},"mio_throttle":{"short":"MIO Throttle","long":"Miscellaneous IO pipeline throttle.","unit":"samples"},"drain":{"short":"Drain","long":"Warp is draining (completing pending operations before exit).","unit":"samples"},"branch_resolving":{"short":"Branch Resolving","long":"Warp stalled while a branch target is being resolved.","unit":"samples"},"dispatch_stall":{"short":"Dispatch Stall","long":"Instruction dispatch stall.","unit":"samples"},"imc_miss":{"short":"I-Cache Miss","long":"Instruction cache miss. Kernel is too large to fit in instruction cache.","unit":"samples"},"misc":{"short":"Miscellaneous","long":"Other stall reasons not categorized.","unit":"samples"},"selected":{"short":"Selected (Issued)","long":"Warp was selected and issued an instruction this cycle.","unit":"samples"},"warpgroup_arrive":{"short":"Warpgroup Arrive","long":"Warp stalled at warpgroup arrive barrier (Hopper+ architecture).","unit":"samples"},"mma":{"short":"MMA Stall","long":"Warp stalled waiting for matrix multiply-accumulate result.","unit":"samples"},"reuse_distance":{"short":"Reuse Distance","long":"Stack distance between consecutive accesses to the same cache line. Shorter distance = better temporal locality = higher cache hit rate.","unit":"histogram"},"working_set":{"short":"Working Set","long":"Number of unique cache lines accessed within a sliding window. Indicates memory footprint at different time scales.","unit":"lines"},"shared_line_ratio":{"short":"Shared Line Ratio","long":"Fraction of cache lines accessed by more than one warp. Higher = more inter-warp data sharing.","unit":"%"},"avg_warps_per_line":{"short":"Avg Warps/Line","long":"Average number of warps accessing each cache line. 
Higher = more sharing and potential for L1 cache contention.","unit":"warps"},"smsp__sass_sectors_mem_global":{"short":"Global Sectors","long":"32-byte sectors accessed for global memory. More sectors per instruction = worse coalescing. Compare with _ideal variant.","unit":"sectors"},"smsp__sass_sectors_mem_global_ideal":{"short":"Global Sectors (Ideal)","long":"Minimum sectors needed if accesses were perfectly coalesced. Ratio ideal/actual = coalescing efficiency.","unit":"sectors"},"smsp__sass_l1tex_pipe_lsu_wavefronts_mem_shared":{"short":"Shared Mem Wavefronts","long":"LSU pipe wavefronts for shared memory. More wavefronts = more passes due to bank conflicts.","unit":"wavefronts"},"smsp__sass_l1tex_pipe_lsu_wavefronts_mem_shared_ideal":{"short":"Shared Wavefronts (Ideal)","long":"Minimum wavefronts if no bank conflicts. Ratio ideal/actual = shared memory efficiency.","unit":"wavefronts"},"smsp__sass_branch_targets_threads_uniform":{"short":"Uniform Branches","long":"Branch target evaluations where all threads in a warp took the same path (no divergence).","unit":"count"},"smsp__sass_branch_targets_threads_divergent":{"short":"Divergent Branches","long":"Branch target evaluations where threads in a warp took different paths (warp divergence).","unit":"count"},"smsp__sass_inst_executed_op_branch":{"short":"Branch Insts","long":"Branch instructions executed (warp-level). 
From CUPTI SASS instruction mix profile.","unit":"instr"},"smsp__sass_inst_executed_op_global":{"short":"Global Mem Insts","long":"Global memory instructions executed (loads + stores, warp-level).","unit":"instr"},"smsp__sass_inst_executed_op_shared":{"short":"Shared Mem Insts","long":"Shared memory instructions executed (warp-level).","unit":"instr"},"smsp__sass_inst_executed_op_global_ld":{"short":"Global Load Insts","long":"Global memory load instructions executed (warp-level).","unit":"instr"},"smsp__sass_inst_executed_op_global_st":{"short":"Global Store Insts","long":"Global memory store instructions executed (warp-level).","unit":"instr"},"smsp__sass_inst_executed_op_shared_ld":{"short":"Shared Load Insts","long":"Shared memory load instructions executed (warp-level).","unit":"instr"},"smsp__sass_inst_executed_op_shared_st":{"short":"Shared Store Insts","long":"Shared memory store instructions executed (warp-level).","unit":"instr"},"smsp__sass_thread_inst_executed_op_branch":{"short":"Thread Branch Insts","long":"Thread-level branch instruction executions.","unit":"thread-instr"},"simt_utilization":{"short":"SIMT Utilization","long":"Active threads per warp instruction / 32. Measures lane utilization across the warp. NOT SM occupancy. 100% = all 32 lanes active on every instruction (no divergence). Lower values indicate branch divergence or partial warps.","unit":"%"},"predication_eff":{"short":"Predication Efficiency","long":"Fraction of thread-instructions where predicate was ON (actually executed). Lower = more predicated-off work from compiler-generated conditional code.","unit":"%"},"global_coalescing":{"short":"Global Mem Coalescing","long":"Ratio of ideal sectors to actual sectors for global memory. 100% = perfectly coalesced. Lower values mean scattered memory accesses fetching unnecessary data.","unit":"%"},"shared_efficiency":{"short":"Shared Mem Efficiency","long":"Ratio of ideal wavefronts to actual wavefronts for shared memory. 
< 100% indicates bank conflicts requiring multiple passes.","unit":"%"},"threads_executed":{"short":"Threads Executed","long":"Total thread-level executions from CUPTI instruction execution profiling. Sum of active threads across all warp executions of instructions in this region.","unit":"threads"},"executed":{"short":"Warp Executions","long":"Total warp-level instruction executions from CUPTI. Each count = one warp executing one instruction.","unit":"warps"},"inst_count":{"short":"Unique Instructions","long":"Number of unique PC addresses (static instructions) observed in this region by CUPTI.","unit":"count"},"notPredOffThreadsExecuted":{"short":"Not Pred-Off Threads","long":"Thread executions where predicate was not OFF. Ratio to threadsExecuted = thread-level utilization.","unit":"threads"},"percentile":{"short":"Percentile","long":"Duration at which N% of region invocations complete. P50 = median. P95/P99 = tail latency. Large P99/P50 ratio indicates occasional slow invocations.","unit":"ticks"},"mem_trace_records":{"short":"Memory Trace Records","long":"Raw memory access records captured by NVBit. Each record = one warp memory instruction with 32 lane addresses.","unit":"records"},"unique_cache_lines":{"short":"Unique Cache Lines","long":"Distinct 128B cache lines accessed per warp instruction. 1 = perfect coalescing, >4 = poor coalescing for 4B accesses.","unit":"lines"},"cache_line_span":{"short":"Cache Line Span","long":"Distance between the first and last cache line touched by a single warp instruction. Larger span = more scattered access pattern.","unit":"lines"},"coalescing_ratio":{"short":"Coalescing Ratio","long":"Minimum possible cache lines / actual cache lines per instruction. 100% = optimal. Lower = wasted bandwidth from non-contiguous access.","unit":"%"},"global_mem_frac":{"short":"Global Mem Fraction","long":"Fraction of warp instructions that are global memory ops (CUPTI). 
Higher = memory-intensive region.","unit":"%"},"shared_mem_frac":{"short":"Shared Mem Fraction","long":"Fraction of warp instructions that are shared memory ops (CUPTI).","unit":"%"},"branch_frac":{"short":"Branch Fraction","long":"Fraction of warp instructions that are branches (CUPTI). Higher = more control flow overhead.","unit":"%"},"compute_frac":{"short":"Compute Fraction","long":"Fraction of instructions that are pure compute (not memory or branch). Higher = compute-dominated region.","unit":"%"},"tma_frac":{"short":"TMA Fraction","long":"Fraction of instructions using Tensor Memory Accelerator (Hopper+). Present only in TMA-enabled kernels.","unit":"%"},"tensor_frac":{"short":"Tensor/WGMMA Fraction","long":"Fraction of instructions using Warp Group Matrix-Multiply-Accumulate (WGMMA/GMMA). Core compute for matrix operations on Hopper+.","unit":"%"},"global_load_frac":{"short":"Global Load Ratio","long":"Fraction of global memory ops that are loads (vs stores). Near 1.0 = read-dominated, near 0 = write-dominated.","unit":"ratio"},"shared_load_frac":{"short":"Shared Load Ratio","long":"Fraction of shared memory ops that are loads (vs stores).","unit":"ratio"},"sectors_per_global_inst":{"short":"Sectors/Global Inst","long":"Average 32B sectors per global memory instruction. Ideal coalescing for 4B per thread = 4 sectors/inst. Higher = poor coalescing.","unit":"sectors"},"wavefronts_per_shared_inst":{"short":"Wavefronts/Shared Inst","long":"Average L1TEX wavefronts per shared memory instruction. >1.0 indicates bank conflicts causing replay.","unit":"wavefronts"},"branch_avg_active_lanes":{"short":"Branch Avg Active Lanes","long":"Average active thread lanes during branch execution (CUPTI thread_inst/inst). 32 = no divergence.","unit":"lanes"},"branch_lane_utilization":{"short":"Branch Lane Utilization","long":"Fraction of possible lanes active during branches (avg_lanes/32). 
<1.0 = warp divergence at branches.","unit":"%"},"nvbit_vs_cupti":{"short":"NVBit vs CUPTI","long":"Cross-validation of instruction counts between NVBit (binary instrumentation) and CUPTI (hardware counters). Delta < 5% = good agreement. Larger delta indicates pc2region coverage gaps.","unit":""}},"dataQuality":{"nvbit_modes":[],"cupti_profiles":[],"has_trace":true,"has_pcsamp":false,"pcsamp_total":0,"has_instrexec":false,"instrexec_total":0,"has_locality":false,"has_source_mapping":false,"has_nsys":true,"nsys_kernel_count":1,"nsys_nccl_count":0},"lineRegions":{"51":1,"52":1,"53":1,"54":2,"55":2,"56":2,"57":2,"58":2,"59":2,"60":2,"61":1,"62":3,"63":3,"64":3,"65":3,"66":3,"67":3,"68":3,"69":1,"70":1,"71":1,"72":4,"73":4,"74":4,"75":4,"76":4,"77":4,"78":1,"79":1,"80":1,"81":5,"82":5,"83":5,"84":5,"85":5,"86":5,"87":5,"88":1,"89":1},"memTrace":null,"pc2src":{},"nsysEvents":{"source":"report.nsys-rep","gpu_events":{"kernels":[{"name":"gemm_kernel","demangled_name":"gemm_kernel(const float *, const float *, float *, int, int, int, int, 
intra_kernel_profiler::trace::GlobalBuffer)","mangled_name":"_Z11gemm_kernelPKfS0_PfiiiiN21intra_kernel_profiler5trace12GlobalBufferE","start_ns":1564580581,"end_ns":1565466888,"duration_ns":886307,"device_id":0,"stream_id":7,"correlation_id":127,"grid":[1024,1,1],"block":[256,1,1],"registers_per_thread":72,"static_shared_memory":8192,"dynamic_shared_memory":0}],"memcpy":[{"start_ns":1561523481,"end_ns":1562486332,"duration_ns":962851,"bytes":16777216,"kind":"HtoD","kind_id":1,"stream_id":7,"correlation_id":121},{"start_ns":1562552924,"end_ns":1563553696,"duration_ns":1000772,"bytes":16777216,"kind":"HtoD","kind_id":1,"stream_id":7,"correlation_id":122},{"start_ns":1565515016,"end_ns":1565519208,"duration_ns":4192,"bytes":32768,"kind":"DtoH","kind_id":2,"stream_id":7,"correlation_id":129},{"start_ns":1769538041,"end_ns":1804657286,"duration_ns":35119245,"bytes":536870912,"kind":"DtoH","kind_id":2,"stream_id":7,"correlation_id":130}]},"nccl":{"kernels":[]},"nvtx_ranges":[],"counts":{"kernels":1,"memcpy":4,"memset":2,"nccl_kernels":0,"nvtx_ranges":0}}};
// ── Load Data (switch kernel) ──
// The visible "Load" button simply forwards the click to the hidden file input.
document.getElementById('loadBtn').addEventListener('click', () => document.getElementById('loadFile').click());
// Handles a picked file:
//   *.html — opened as-is in a new tab (another explorer page);
//   *.json — spliced into a copy of this page in place of the `let D = ...;` payload,
//            then opened in a new tab.
// Any failure (parse error, wrong shape, unsupported extension, read error) is reported via alert().
document.getElementById('loadFile').addEventListener('change', function(e) {
  const file = e.target.files[0];
  if (!file) return;

  // Open an HTML string in a new tab through a blob URL.
  const openHtmlInNewTab = (html) => {
    const blob = new Blob([html], {type: 'text/html'});
    window.open(URL.createObjectURL(blob), '_blank');
  };
  const reportFailure = (message) => {
    alert('Load failed: ' + message +
      '\n\nTo switch kernels, regenerate with:\n' +
      ' python3 scripts/generate_explorer.py \\\n' +
      ' --demo-dir <output_dir> \\\n' +
      ' --source <kernel.cu> \\\n' +
      ' --output explorer.html');
  };

  const reader = new FileReader();
  reader.onload = function(ev) {
    try {
      const text = ev.target.result;
      if (file.name.endsWith('.html')) {
        // Open another explorer HTML in a new tab
        openHtmlInNewTab(text);
      } else if (file.name.endsWith('.json')) {
        // Standalone JSON data — wrap into a new explorer HTML
        const newData = JSON.parse(text);
        if (!newData.source) throw new Error('JSON must contain "source" key (use generate_explorer.py output)');
        // '<' only occurs inside JSON string values; writing it as \u003c keeps the data
        // identical while preventing a "</script>" in the payload from closing the inline script.
        const payload = JSON.stringify(newData).replace(/</g, '\\u003c');
        // A replacer function is used so "$&", "$1", "$'" etc. inside the payload are
        // inserted literally instead of being expanded as replacement patterns.
        const newHtml = document.documentElement.outerHTML.replace(
          /let D = .*?;\n/s,
          () => 'let D = ' + payload + ';\n'
        );
        openHtmlInNewTab(newHtml);
      } else {
        throw new Error('Unsupported file type. Use .html (explorer) or .json (data).');
      }
    } catch(err) {
      reportFailure(err.message);
    }
  };
  // Surface read failures instead of silently doing nothing.
  reader.onerror = function() {
    reportFailure((reader.error && reader.error.message) || 'could not read file');
  };
  reader.readAsText(file);
});
// ── Helpers ──
/**
 * Compact human-readable number formatting with K/M/G suffixes.
 *
 * - null/undefined/0 render as '0'.
 * - Non-integer numbers keep two decimals when scaled and three decimals otherwise.
 * - Everything else is rounded first; the K tier then uses a single decimal.
 *
 * @param {*} v value to format
 * @returns {string}
 */
function fmt(v) {
  if (v == null || v === 0) return '0';

  const fractional = typeof v === 'number' && !Number.isInteger(v);
  const num = fractional ? v : Math.round(v);
  const magnitude = Math.abs(num);

  // [threshold, suffix, decimals]; only the K tier differs between the two modes.
  const tiers = [
    [1e9, 'G', 2],
    [1e6, 'M', 2],
    [1e3, 'K', fractional ? 2 : 1],
  ];
  for (const [threshold, suffix, decimals] of tiers) {
    if (magnitude >= threshold) return (num / threshold).toFixed(decimals) + suffix;
  }
  return fractional ? num.toFixed(3) : num.toString();
}
/**
 * Format a byte count using binary units (1 KB = 1024 B).
 * null/undefined/0 render as '0 B'; values below 1024 are printed unscaled.
 *
 * @param {number} b byte count
 * @returns {string}
 */
function fmtBytes(b) {
  if (b == null || b === 0) return '0 B';

  // [unit size in bytes, suffix, decimals], largest first.
  const units = [
    [1073741824, ' GB', 2],
    [1048576, ' MB', 2],
    [1024, ' KB', 1],
  ];
  const match = units.find(([size]) => b >= size);
  if (!match) return b + ' B';

  const [size, suffix, decimals] = match;
  return (b / size).toFixed(decimals) + suffix;
}
/**
 * Render a 0..1 fraction as a percentage with one decimal; null/undefined become 'N/A'.
 *
 * @param {number|null|undefined} v
 * @returns {string}
 */
function fmtPct(v) {
  if (v == null) return 'N/A';
  return `${(v * 100).toFixed(1)}%`;
}
/**
 * Build the small "i" icon that opens the definition tooltip for a metric.
 *
 * The icon only carries the definition key (data-def); the tooltip text itself is
 * looked up from D.defs when the icon is clicked (see the inline onclick handler).
 *
 * @param {string} key key into D.defs
 * @returns {string} HTML for the icon, or '' when no definition exists for `key`
 */
function infoIcon(key) {
  const d = D.defs && D.defs[key];
  if (!d) return '';
  // The key lands inside a double-quoted attribute; escape it so unusual keys cannot
  // break out of the attribute. getAttribute() returns the unescaped value again.
  const safeKey = String(key)
    .replace(/&/g, '&amp;')
    .replace(/"/g, '&quot;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  return '<span class="info-icon" data-def="'+safeKey+'" onclick="event.stopPropagation();showInfoTip(this)">i</span>';
}
// One shared tooltip element for all info icons; it starts hidden and is attached to <body>.
const _tipEl = Object.assign(document.createElement('div'), { className: 'info-tip' });
_tipEl.style.display = 'none';
document.body.appendChild(_tipEl);
// The icon element whose tooltip is currently shown (null when the tooltip is hidden).
let _tipOwner = null;
/**
 * Toggle the shared definition tooltip for an info icon.
 *
 * Clicking the icon that currently owns the tooltip hides it. Otherwise the tooltip is
 * filled from D.defs[<data-def>] and positioned above the icon (below it when there is
 * not enough room), clamped horizontally to the viewport with a 4px margin.
 *
 * @param {Element} el the clicked .info-icon element
 */
function showInfoTip(el) {
  if (_tipOwner === el) { _tipEl.style.display = 'none'; _tipOwner = null; return; }
  const key = el.getAttribute('data-def');
  const d = D.defs && D.defs[key];
  if (!d) return;
  // Definition text is data (it may come from a user-loaded JSON file and contains
  // characters such as '<' and '>'), so it must be escaped before going into innerHTML.
  const esc = (s) => String(s == null ? '' : s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  const unit = d.unit && d.unit !== 'histogram' && d.unit !== 'class' ? ' <b>['+esc(d.unit)+']</b>' : '';
  _tipEl.innerHTML = '<b>'+esc(d.short)+'</b><br>'+esc(d.long)+unit;
  // Must be visible before measuring offsetWidth/offsetHeight.
  _tipEl.style.display = 'block';
  const r = el.getBoundingClientRect();
  let top = r.top - _tipEl.offsetHeight - 6;
  let left = r.left + r.width/2 - _tipEl.offsetWidth/2;
  if (top < 4) top = r.bottom + 6;
  if (left < 4) left = 4;
  if (left + _tipEl.offsetWidth > window.innerWidth - 4) left = window.innerWidth - _tipEl.offsetWidth - 4;
  _tipEl.style.top = top + 'px';
  _tipEl.style.left = left + 'px';
  _tipOwner = el;
}
// Clicking anywhere that is not (inside) an info icon dismisses the tooltip.
document.addEventListener('click', (evt) => {
  if (evt.target.closest('.info-icon')) return;
  _tipEl.style.display = 'none';
  _tipOwner = null;
});
/**
 * Render one horizontal bar row: label, proportional bar, formatted value (+ unit).
 *
 * @param {string} label   row label (inserted as HTML)
 * @param {number} value   value represented by the bar
 * @param {number} maxVal  value corresponding to a full bar; <= 0 yields an empty bar
 * @param {string} [color] CSS background for the fill (defaults to var(--accent))
 * @param {string} [defKey] key into D.defs; supplies the hover title and the unit suffix
 * @returns {string} HTML string
 */
function barRow(label,value,maxVal,color,defKey){
  const d = D.defs && D.defs[defKey];
  // Clamp to [0, 100] and fall back to 0 for NaN/Infinity so the fill never
  // overflows its track or produces an invalid "width:NaN%".
  const rawPct = maxVal > 0 ? (value / maxVal * 100) : 0;
  const pct = Number.isFinite(rawPct) ? Math.min(100, Math.max(0, rawPct)) : 0;
  // The description is placed in a double-quoted attribute: '&' and '"' must be escaped.
  const escAttr = (s) => String(s).replace(/&/g, '&amp;').replace(/"/g, '&quot;');
  const tip = defKey && d && d.long ? ' title="'+escAttr(d.long)+'"' : '';
  const unit = d && d.unit && d.unit !== 'histogram' && d.unit !== 'class'
    ? ' <span style="font-size:9px;color:var(--dim)">'+d.unit+'</span>'
    : '';
  return '<div class="bar-row"><span class="bar-label"'+tip+'>'+label+'</span>'+
    '<div class="bar-track"><div class="bar-fill" style="width:'+pct+'%;background:'+(color||'var(--accent)')+'"></div></div>'+
    '<span class="bar-val">'+fmt(value)+unit+'</span></div>';
}
/**
 * Card helper: renders a metric card (value, optional unit line, label + info icon).
 *
 * @param {*} val       value to show; formatted with fmt()/fmtBytes() unless opts.raw is set
 * @param {string} label card caption (inserted as HTML)
 * @param {string} [defKey] key into D.defs for the unit and the info icon; keys containing
 *                          'bytes' are formatted as byte counts and never get a unit line
 * @param {{raw?: boolean, noUnit?: boolean, style?: string}} [opts]
 *        raw: insert `val` untouched; noUnit: suppress the unit line;
 *        style: inline CSS for the value element
 * @returns {string} HTML string
 */
function card(val, label, defKey, opts) {
  const { raw, noUnit, style } = opts || {};
  const def = D.defs && D.defs[defKey];
  const isByte = defKey && defKey.includes('bytes');

  let shown;
  if (raw) {
    shown = val;
  } else if (isByte) {
    shown = fmtBytes(val);
  } else {
    shown = fmt(val);
  }

  // 'histogram' and 'class' are descriptive pseudo-units and are never displayed.
  let unit = '';
  if (def && def.unit && def.unit !== 'histogram' && def.unit !== 'class') unit = def.unit;

  // Skip the unit line when the value already carries a percent sign.
  let unitHtml = '';
  if (!isByte && unit && !noUnit && !String(shown).includes('%')) {
    unitHtml = '<div style="font-size:8px;color:var(--dim);margin-top:1px">' + unit + '</div>';
  }

  const styleAttr = style ? ` style="${style}"` : '';
  const icon = defKey ? infoIcon(defKey) : '';
  return `<div class="card"><div class="cv"${styleAttr}>${shown}</div>${unitHtml}` +
    `<div class="cl">${label}${icon}</div></div>`;
}
/**
 * Render the bottleneck badge for a region.
 *
 * Known types ('compute', 'memory', 'branch', 'balanced') get their own CSS class and
 * tooltip. Anything else is styled and explained as 'balanced'. A missing type (null,
 * undefined or '') is also labelled 'balanced' instead of printing "undefined-bound".
 *
 * @param {string} type bottleneck class
 * @returns {string} HTML string
 */
function badgeFor(type) {
  const m = {compute:'badge-compute',memory:'badge-memory',branch:'badge-branch',balanced:'badge-balanced'};
  const tips = {
    compute: 'Compute-bound: >30% of instructions are FP32 ALU or Tensor (WGMMA). Bottleneck = arithmetic pipeline throughput.',
    memory: 'Memory-bound: >30% of instructions are global/shared loads or stores. Bottleneck = memory bandwidth or latency.',
    branch: 'Branch-bound: >30% of instructions are branches. Bottleneck = control flow overhead.',
    balanced: 'Balanced: no single category exceeds 30% of instructions. Workload is mixed or dominated by other op types.'
  };
  // Own-property check: names such as 'constructor' or 'toString' must not resolve
  // through Object.prototype and leak a function into the class attribute.
  const known = Object.prototype.hasOwnProperty.call(m, type);
  const styleKey = known ? type : 'balanced';
  const label = (type == null || type === '') ? 'balanced' : type;
  return '<span class="badge '+m[styleKey]+'" title="'+tips[styleKey]+'">'+label+'-bound</span>';
}
/**
 * Render an efficiency bar coloured by threshold: >90 green, >50 yellow, otherwise red.
 *
 * The label always shows the real percentage, while the fill width is clamped to
 * 0..100 so out-of-range values cannot overflow the bar. A missing or non-finite
 * percentage renders an empty bar labelled "N/A" instead of throwing or printing "NaN%".
 *
 * @param {number} pct   percentage (0..100 expected)
 * @param {string} label caption shown inside the bar (inserted as HTML)
 * @returns {string} HTML string
 */
function effBar(pct, label) {
  const valid = typeof pct === 'number' && Number.isFinite(pct);
  const width = valid ? Math.min(100, Math.max(0, pct)) : 0;
  const text = valid ? pct.toFixed(1)+'%' : 'N/A';
  const clr = pct>90?'var(--green)':pct>50?'var(--yellow)':'var(--red)';
  return '<div class="eff-bar" style="background:var(--bg3)"><div class="eff-fill" style="width:'+width+'%;background:'+clr+'"></div>'+
    '<div class="eff-label">'+label+': '+text+'</div></div>';
}
/**
 * Collapse or expand a metrics section created by sectionStart().
 * Does nothing unless both the header (#sh_<id>) and the body (#sb_<id>) exist.
 *
 * @param {string} id section id suffix
 */
function secToggle(id) {
  const parts = ['sh_', 'sb_'].map((prefix) => document.getElementById(prefix + id));
  if (!parts.every(Boolean)) return;
  for (const node of parts) node.classList.toggle('collapsed');
}
/**
 * Opening markup of a collapsible metrics section: a clickable header (#sh_<id>)
 * followed by the still-open body container (#sb_<id>). Close it with sectionEnd().
 *
 * @param {string} id    section id suffix, also passed to secToggle() on click
 * @param {string} title heading (inserted as HTML)
 * @returns {string} HTML string
 */
function sectionStart(id, title) {
  const header =
    `<div class="sec-hdr" id="sh_${id}" onclick="secToggle('${id}')">` +
    `<span class="arrow">\u25BC</span><h4 style="border:none;margin:0;padding:0">${title}</h4></div>`;
  const bodyOpen = `<div class="sec-body" id="sb_${id}" style="max-height:9999px">`;
  return `<div class="msec">${header}${bodyOpen}`;
}
/** Closing markup matching sectionStart(): closes the section body and the wrapper. */
function sectionEnd() {
  return '</div>'.repeat(2);
}
// Track ECharts instances (and the ResizeObserver attached to each) for cleanup
const _charts = {};
const _chartObservers = {};
/**
 * Create (or re-create) the ECharts instance hosted in element #id.
 *
 * Any previous instance registered under the same id is disposed first, and its
 * ResizeObserver is disconnected so it neither leaks nor calls resize() on a
 * disposed chart.
 *
 * @param {string} id element id of the chart container
 * @param {*} [h]     unused; kept for call-site compatibility
 * @returns {object|null} the chart instance, or null when the element does not exist
 */
function mkChart(id, h) {
  if(_chartObservers[id]){_chartObservers[id].disconnect();delete _chartObservers[id];}
  if(_charts[id]){_charts[id].dispose();delete _charts[id];}
  const el=document.getElementById(id);
  if(!el)return null;
  const c=echarts.init(el);
  // Patch setOption to ensure all tooltips render on body (avoids clipping by overflow:hidden parents).
  // Extra arguments (notMerge / lazyUpdate / options object) are forwarded unchanged, and the
  // caller's option object is shallow-copied rather than mutated.
  const _origSet=c.setOption.bind(c);
  c.setOption=function(opt, ...rest){
    if(opt && opt.tooltip){
      opt=Object.assign({},opt,{tooltip:Object.assign({appendToBody:true,confine:true},opt.tooltip)});
    }
    return _origSet(opt, ...rest);
  };
  _charts[id]=c;
  const observer=new ResizeObserver(()=>{
    // The chart may have been disposed directly by other code; stop observing in that case.
    if(typeof c.isDisposed==='function' && c.isDisposed()){observer.disconnect();return;}
    c.resize();
  });
  observer.observe(el);
  _chartObservers[id]=observer;
  return c;
}
// Colour assigned to each instruction class key.
const INST_CLASS_COLORS = {
  // arithmetic
  alu_fp32: '#1a7f37',
  alu_int: '#0969da',
  tensor_wgmma: '#8250df',
  // global / shared memory
  ld_global: '#bc4c00',
  st_global: '#e16f24',
  ld_shared: '#0550ae',
  st_shared: '#368cf9',
  // local memory
  ld_local: '#9a6700',
  st_local: '#bf8700',
  // synchronisation
  barrier: '#cf222e',
  membar: '#da3633',
  // control flow
  branch: '#6639ba',
  call: '#8b949e',
  ret: '#afb8c1',
  // everything else
  special: '#57606a',
  other: '#d0d7de',
};
// ── Header ──
// Shows the source path and a one-line summary of which data sets this page contains.
document.getElementById('fileName').textContent = D.source.path;
const infoParts = [];
{
  const countKeys = (obj) => Object.keys(obj).length;

  const annotatedLines = countKeys(D.perLine);
  if (annotatedLines) infoParts.push(`<b>${annotatedLines}</b> annotated lines`);

  const regionCount = countKeys(D.regions);
  if (regionCount) infoParts.push(`<b>${regionCount}</b> regions`);

  if (D.sass.text) infoParts.push('SASS');
  if (D.ptx.text) infoParts.push('PTX');
  if (D.trace) infoParts.push('Trace');

  // sassProfiles is optional, unlike perLine/regions above.
  const profileCount = D.sassProfiles ? countKeys(D.sassProfiles) : 0;
  if (profileCount) infoParts.push(`<b>${profileCount}</b> SASS profiles`);
}
document.getElementById('headerInfo').innerHTML = infoParts.join(' <span class="sep">|</span> ');
// ── Monaco Setup ──
// Monaco is loaded from a CDN through the page's AMD loader (`require`).
const MCDN = 'https://cdn.jsdelivr.net/npm/monaco-editor@0.52.0/min';
require.config({ paths: { vs: `${MCDN}/vs` } });
window.MonacoEnvironment = {
  // Returns a data: URL whose script sets the worker's base URL and then
  // importScripts() Monaco's worker entry point from the CDN.
  getWorkerUrl(_, __) {
    const bootstrap =
      `self.MonacoEnvironment={baseUrl:"${MCDN}/"};importScripts("${MCDN}/vs/base/worker/workerMain.js");`;
    return 'data:text/javascript;charset=utf-8,' + encodeURIComponent(bootstrap);
  },
};
require(["vs/editor/editor.main"], function () {
document.getElementById('loading').style.display = 'none';
// ── Languages ──
// Registers the 'cuda' language id and a Monarch tokenizer for it. The tokenizer is
// used by the source editor, which is created with language 'cuda'.
monaco.languages.register({ id: 'cuda' });
monaco.languages.setMonarchTokensProvider('cuda', {
// Word lists referenced from the identifier rule below via '@keywords', '@cudaKw', '@types'.
// C/C++ keywords and built-in type names:
keywords: ['void','int','float','double','char','unsigned','long','short','bool','const',
'static','extern','inline','constexpr','if','else','for','while','do','return','break',
'continue','struct','class','template','typename','namespace','using','true','false',
'nullptr','auto','sizeof','typedef','switch','case','default','enum','volatile'],
// CUDA qualifiers, built-in index variables and common intrinsics:
cudaKw: ['__global__','__device__','__host__','__shared__','__constant__','__restrict__',
'__launch_bounds__','__forceinline__','blockIdx','blockDim','threadIdx','gridDim',
'warpSize','atomicAdd','atomicCAS','__syncthreads','__syncwarp',
'__shfl_sync','__shfl_xor_sync','__shfl_up_sync','__shfl_down_sync'],
// Fixed-width, vector and CUDA runtime type names:
types: ['dim3','uint32_t','int32_t','uint64_t','int64_t','size_t','float2','float4',
'int2','int4','cudaError_t','cudaStream_t'],
// NOTE(review): rule order looks significant (e.g. the IKP_ macro rule sits before the
// generic identifier rule) — keep the order when editing.
tokenizer: { root: [
// Line comments; '/*' switches to the 'comment' state defined at the end.
[/\/\/.*$/, 'comment'], [/\/\*/, 'comment', '@comment'],
// Single-line string and character literals (no escape handling).
[/"[^"]*"/, 'string'], [/'[^']*'/, 'string'],
// Preprocessor directives such as #include / #define.
[/#\w+/, 'keyword.directive'],
// Identifiers starting with IKP_ get their own token class ('keyword.macro').
[/\bIKP_\w+\b/, 'keyword.macro'],
// Decimal / floating literals with optional suffix letters, then hex literals.
[/\b\d[\d.]*[fFeEuUlL]*\b/, 'number'],
[/0x[0-9a-fA-F]+/, 'number.hex'],
// Any other identifier is classified through the word lists above.
[/[a-zA-Z_]\w*/, { cases: { '@keywords':'keyword', '@cudaKw':'keyword.cuda', '@types':'type', '@default':'identifier' }}],
// Kernel launch brackets.
[/<<</, 'delimiter.cuda'], [/>>>/, 'delimiter.cuda'],
// Block-comment state: '*/' pops back to root, everything else stays a comment.
], comment: [[/\*\//, 'comment', '@pop'], [/./, 'comment']] }
});
// SASS (NVIDIA GPU assembly) tokenizer.
// Opcodes are classified by their BASE name through the fp/mem/ctrl/intop lists, so
// "LDG.E.128" is highlighted as a memory op and "FFMA.FTZ" as a floating-point op.
monaco.languages.register({ id: 'sass-asm' });
monaco.languages.setMonarchTokensProvider('sass-asm', {
  fp: ['FFMA','FMUL','FADD','HMMA','WGMMA','HFMA2','DFMA','MUFU'],
  mem: ['LDG','STG','LDS','STS','LDC','LDSM','LDGSTS','ATOMS','ATOMG'],
  ctrl: ['BRA','EXIT','BSSY','BSYNC','RET','CALL','BAR','YIELD','WARPSYNC','NANOSLEEP'],
  intop: ['IMAD','IADD','ISETP','S2R','MOV','SEL','SHF','PRMT','LEA','LOP3','IMNMX','SGXT'],
  tokenizer: { root: [
    [/\/\/.*$/, 'comment'],
    // Instruction address column, e.g. /*0010*/
    [/\/\*[0-9a-fA-F]+\*\//, 'number.hex'],
    // Registers must be tried BEFORE the opcode rule: "R12", "P0", "UR4" are also valid
    // matches for an upper-case word and would otherwise never reach these rules.
    // The trailing \b keeps opcodes such as R2P / P2R out (a letter follows the digits).
    // RZ/URZ (zero register) and PT/UPT (true predicate) are included as well.
    [/U?R(?:\d+|Z)\b/, 'variable'],
    [/U?P(?:\d+|T)\b/, 'variable'],
    // Base opcode only: '.' is deliberately not part of the word, otherwise the whole
    // "OP.MOD.MOD" text is compared against the lists and never matches.
    [/[A-Z][A-Z0-9_]+\b/, { cases: { '@fp':'keyword.fp', '@mem':'keyword.mem', '@ctrl':'keyword.ctrl', '@intop':'keyword.int', '@default':'keyword' }}],
    // Opcode modifiers (.E, .FTZ, .SYNC, ...) keep the generic keyword colour.
    [/\.[A-Z][A-Z0-9_]*\b/, 'keyword'],
    [/0x[0-9a-fA-F]+/, 'number.hex'], [/\b\d+\b/, 'number'],
  ] }
});
// PTX tokenizer: directives / state spaces, scalar types, a set of common opcodes,
// %-prefixed registers, numbers and strings.
monaco.languages.register({ id: 'ptx' });
monaco.languages.setMonarchTokensProvider('ptx', {
  tokenizer: { root: [
    [/\/\/.*$/, 'comment'],
    [/\.(version|target|address_size|visible|entry|func|global|local|shared|const|param|reg|pred)\b/, 'keyword.directive'],
    [/\.(s8|s16|s32|s64|u8|u16|u32|u64|f16|f32|f64|b8|b16|b32|b64)\b/, 'type'],
    [/\b(ld|st|mov|add|mul|mad|sub|div|rem|and|or|xor|not|shl|shr|setp|selp|bra|ret|exit|bar|atom|red|cvt|abs|neg|min|max|fma|rcp|sqrt)\b/, 'keyword'],
    // Consume every other identifier (labels, kernel and parameter names) as ONE token.
    // Without this rule such names are scanned character by character, and an opcode
    // name sitting at the tail of a longer name (e.g. the "add" in "vecadd") can be
    // picked up by the keyword rule above.
    [/[a-zA-Z_$][\w$]*/, 'identifier'],
    [/%\w+/, 'variable'], [/0x[0-9a-fA-F]+/, 'number.hex'], [/\b\d+\b/, 'number'], [/"[^"]*"/, 'string'],
  ] }
});
// ── Theme ──
// 'ikp-light': light theme inheriting from Monaco's built-in 'vs', with colours for the
// custom token classes emitted by the cuda / sass-asm / ptx tokenizers.
{
  // [token, foreground (hex without '#'), optional font style]
  const tokenColours = [
    ['keyword.cuda', '0550ae', 'bold'],
    ['keyword.macro', 'bc4c00', 'bold'],
    ['keyword.directive', '8250df'],
    ['type', '0550ae'],
    ['keyword.fp', '1a7f37', 'bold'],
    ['keyword.mem', 'bc4c00'],
    ['keyword.ctrl', '8250df'],
    ['keyword.int', '0969da'],
    ['number.hex', '656d76'],
    ['variable', '0550ae'],
  ];
  const rules = tokenColours.map(([token, foreground, fontStyle]) => (
    fontStyle ? { token, foreground, fontStyle } : { token, foreground }
  ));
  const colors = {
    'editor.background': '#ffffff',
    'editor.lineHighlightBackground': '#f6f8fa00',
    'editorGutter.background': '#f6f8fa',
    'editorLineNumber.foreground': '#656d76',
    'editor.selectionBackground': '#b6d5f2',
    'minimap.background': '#f6f8fa',
  };
  monaco.editor.defineTheme('ikp-light', { base: 'vs', inherit: true, rules, colors });
}
// ── Editor options ──
const viewerOpts = {
readOnly: true, domReadOnly: true, theme: 'ikp-light',
minimap: { enabled: true, renderCharacters: false, scale: 1 },
scrollBeyondLastLine: false, lineNumbersMinChars: 3,
glyphMargin: false, folding: false, contextmenu: false, links: false,
quickSuggestions: false, suggestOnTriggerCharacters: false,
parameterHints: { enabled: false }, renderValidationDecorations: 'off',
matchBrackets: 'never', occurrencesHighlight: 'off', selectionHighlight: false,
overviewRulerLanes: 3, scrollbar: { verticalScrollbarSize: 8, horizontalScrollbarSize: 8 },
fontSize: 13, lineHeight: 20, fontFamily: "'JetBrains Mono','Fira Code','SF Mono',Consolas,monospace",
};
// ── Create editors (source + PTX + SASS) ──
// All three share viewerOpts. Only the source editor enables the glyph margin
// (used for the region labels); PTX/SASS show a placeholder comment when there is no text.
const srcEditor = monaco.editor.create(
  document.getElementById('srcWrap'),
  Object.assign({}, viewerOpts, {
    value: D.source.code,
    language: 'cuda',
    lineDecorationsWidth: 5,
    glyphMargin: true,
    glyphMarginWidth: 28,
  })
);
const ptxEditor = monaco.editor.create(
  document.getElementById('ptxWrap'),
  Object.assign({}, viewerOpts, { value: D.ptx.text || '// No PTX data', language: 'ptx' })
);
const sassEditor = monaco.editor.create(
  document.getElementById('sassWrap'),
  Object.assign({}, viewerOpts, { value: D.sass.text || '// No SASS data', language: 'sass-asm' })
);
// ── Build reverse maps (ASM line → source line) ──
// D.sass.lineMap / D.ptx.lineMap map a source line to a list of assembly lines; these
// tables invert that so a click in an assembly view can find its source line.
// If an assembly line is listed under several source lines, the last one visited wins.
const _sassToSrc = {};
for (const [srcLine, sassLines] of Object.entries(D.sass.lineMap||{})) {
  // Keys are decimal line numbers; pass the radix explicitly.
  const src = Number.parseInt(srcLine, 10);
  for (const sl of sassLines || []) _sassToSrc[sl] = src;
}
const _ptxToSrc = {};
for (const [srcLine, ptxLines] of Object.entries(D.ptx.lineMap||{})) {
  const src = Number.parseInt(srcLine, 10);
  for (const pl of ptxLines || []) _ptxToSrc[pl] = src;
}
// ── Inject dynamic CSS for region glyph labels ──
// One rule per region: `.rg-<rid>::after` shows a short label in the region's colour.
const _glyphStyle = document.createElement('style');
let _glyphCss = '';
for (const [rid, lbl] of Object.entries(D.labels)) {
  const text = String(lbl); // tolerate non-string labels
  // Abbreviate: up to 4 chars. Longer labels drop vowels and underscores first; if
  // nothing is left after that, fall back to the first 4 characters.
  const abbr = text.length <= 4 ? text : (text.replace(/[aeiou_]/gi,'').substring(0,4) || text.substring(0,4));
  // The label is interpolated into a single-quoted CSS string: drop control characters
  // (a raw line break ends the string) and backslash-escape '\' and "'".
  const cssText = abbr.replace(/[\u0000-\u001f]/g, '').replace(/[\\']/g, '\\$&');
  const c = D.colors[rid] || '#8b949e';
  _glyphCss += `.rg-${rid}::after{content:'${cssText}';color:${c};}`;
}
_glyphStyle.textContent = _glyphCss;
document.head.appendChild(_glyphStyle);
// ── Build region nesting map (parent region for each line) ──
// Heuristic: region P is treated as containing region R when P's line span covers R's
// span and P owns more lines than R. Among all such candidates the one owning the
// fewest lines is chosen as the parent (the first one found wins on ties).
const _regionNesting = {}; // rid -> parent_rid
{
  // Per-region summary computed once: first line, last line and number of lines owned.
  const stats = {}; // rid -> { min, max, count }
  for (const [ln, rid] of Object.entries(D.lineRegions || {})) {
    const line = Number.parseInt(ln, 10);
    const s = stats[rid] || (stats[rid] = { min: line, max: line, count: 0 });
    if (line < s.min) s.min = line;
    if (line > s.max) s.max = line;
    s.count++;
  }
  const regions = Object.entries(stats);
  for (const [rid, child] of regions) {
    // `best` is tracked separately from the result map so that a parent with id 0
    // (a falsy value) is not mistaken for "no parent found yet".
    let best = null;
    for (const [pid, parent] of regions) {
      if (pid === rid) continue;
      const contains = parent.min <= child.min && parent.max >= child.max && parent.count > child.count;
      if (contains && (best === null || stats[best].count > parent.count)) {
        best = pid; // choose smallest containing parent
      }
    }
    if (best !== null) _regionNesting[rid] = Number.parseInt(best, 10);
  }
}
// ── Source decorations ──
// One whole-line decoration per source line that has profile data and/or belongs to a
// region: heat background, hover text, overview-ruler / minimap marks, region gutter
// bar and glyph label.
const srcDecos = [];
const srcModel = srcEditor.getModel();
const LR = D.lineRegions || {}; // line -> region_id (from source parsing)
{
  const HOT = 0.8;
  const WARM = 0.3;
  const COLD = 0.01;
  const heatClass = (heat) => {
    if (heat > HOT) return 'line-hot';
    if (heat > WARM) return 'line-warm';
    if (heat > COLD) return 'line-cold';
    return '';
  };
  const heatColor = (heat) => {
    if (heat > HOT) return '#cf222e';
    if (heat > WARM) return '#9a6700';
    return '#0969da';
  };
  const regionName = (id) => D.labels[id] || 'region_' + id;

  const lineCount = srcModel.getLineCount();
  for (let lineNo = 1; lineNo <= lineCount; lineNo++) {
    const info = D.perLine[lineNo];
    const lineRegion = LR[lineNo]; // region from source code parsing
    if (!info && lineRegion == null) continue;

    // Effective region: the source-parsed region wins over the one recorded in the profile data.
    const rid = (lineRegion != null) ? lineRegion : (info ? info.region : null);
    const inRegion = rid != null;

    const opts = { isWholeLine: true };
    let cls = '';
    let hoverMd = `**Line ${lineNo}**`;

    // Region label (and parent region, if nested) — shown with or without profile data.
    if (inRegion) {
      hoverMd += ` \u2014 *${regionName(rid)}*`;
      const parent = _regionNesting[rid];
      if (parent != null) hoverMd += ` (inside *${regionName(parent)}*)`;
    }

    if (info) {
      const ie = info.m['smsp__sass_inst_executed'] || 0;
      const te = info.m['smsp__sass_thread_inst_executed'] || 0;
      hoverMd += `\n\nInstructions: **${fmt(ie)}** | Hotness: **${(info.heat*100).toFixed(1)}%**`;
      if (te > 0 && ie > 0) hoverMd += `\n\nActive threads: ${(te/(ie*32)*100).toFixed(1)}%`;
      hoverMd += `\n\n${info.pcs} PCs | ${info.profiles.join(', ')}`;

      // Heat colouring (background, overview ruler, minimap) only applies INSIDE a region.
      if (inRegion) {
        cls = heatClass(info.heat);
        const hasHeat = info.heat > 0;
        opts.overviewRuler = hasHeat
          ? { color: heatColor(info.heat), position: monaco.editor.OverviewRulerLane.Full }
          : undefined;
        opts.minimap = hasHeat
          ? { color: heatColor(info.heat), position: monaco.editor.MinimapPosition.Inline }
          : undefined;
      }
    }

    opts.className = cls;
    opts.hoverMessage = [{ value: hoverMd }];
    // Add region gutter bar + glyph label for lines inside a profiling region
    if (inRegion) {
      opts.linesDecorationsClassName = 'region-bar region-bar-' + rid;
      opts.glyphMarginClassName = 'region-glyph rg-' + rid;
    }
    srcDecos.push({ range: new monaco.Range(lineNo, 1, lineNo, 1), options: opts });
  }
}
srcEditor.createDecorationsCollection(srcDecos);
// ── Source legend (region color map + heat legend) ──
// One clickable entry per region (selects the region and switches to the region tab),
// followed by the fixed hot / warm / cold heat key.
{
  const leg = document.getElementById('srcLegend');
  // Labels come from the data set and are placed in both HTML text and a
  // double-quoted attribute, so escape them.
  const esc = (s) => String(s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
  let lh = '';
  // Region colors. Sort numerically: the default sort compares as strings (1, 10, 2, ...).
  const rids = Object.keys(D.labels).map(Number).sort((a, b) => a - b);
  for (const rid of rids) {
    const c = D.colors[rid] || '#8b949e';
    const name = esc(D.labels[rid]);
    lh += '<span class="leg-item" onclick="window._selectRegion('+rid+');switchTab(\'region\')" title="Click to show region '+name+'">'+
      '<span class="leg-dot" style="background:'+c+'"></span>'+name+'</span>';
  }
  // Heat legend
  lh += '<span style="margin-left:8px;border-left:1px solid var(--border);padding-left:8px">Heat:</span>';
  lh += '<span class="leg-item"><span class="leg-heat" style="background:rgba(207,34,46,.10)"></span>hot</span>';
  lh += '<span class="leg-item"><span class="leg-heat" style="background:rgba(154,103,0,.08)"></span>warm</span>';
  lh += '<span class="leg-item"><span class="leg-heat" style="background:rgba(9,105,218,.06)"></span>cold</span>';
  leg.innerHTML = lh;
}
// ── Bidirectional cross-highlighting (Source ↔ PTX ↔ SASS) ──
// Shared state for the click handlers below.
// Source line currently selected (set by selectLine and by the PTX/SASS click handlers).
let selectedLine = null;
// Set while a PTX/SASS click is being processed; all three click handlers return early while it is true.
let _crossLock = false; // prevent recursive triggers
// Decoration collections, created empty: highlighted lines in the SASS and PTX views,
// and the selected-line marker in the source view.
let sassDecoCollection = sassEditor.createDecorationsCollection([]);
let ptxDecoCollection = ptxEditor.createDecorationsCollection([]);
let srcSelectCollection = srcEditor.createDecorationsCollection([]);
/**
 * Highlight the PTX and SASS lines mapped to a source line and update both status labels.
 *
 * For each view: if the line map has entries for `srcLine`, they are highlighted, the
 * first one is scrolled into the centre (unless suppressed) and the status shows the
 * count; otherwise the highlight is cleared and the status reports "no mapping".
 *
 * @param {number} srcLine source line number
 * @param {{noPtxScroll?: boolean, noSassScroll?: boolean}} [opts]
 *        suppress scrolling of the respective view (used when the click came from it)
 */
function highlightFromSource(srcLine, opts) {
  const { noPtxScroll, noSassScroll } = opts || {};
  const ptxStatus = document.getElementById('ptxStatus');
  const sassStatus = document.getElementById('sassStatus');

  // Shared logic for one assembly view.
  const applyTo = (lineMap, collection, editor, statusEl, skipScroll, name) => {
    const targets = (lineMap || {})[srcLine] || [];
    if (!targets.length) {
      collection.set([]);
      if (statusEl) statusEl.textContent = 'L' + srcLine + ': no ' + name + ' mapping';
      return;
    }
    const decorations = targets.map((ln) => ({
      range: new monaco.Range(ln, 1, ln, 1),
      options: { isWholeLine: true, className: 'asm-highlight' },
    }));
    collection.set(decorations);
    if (!skipScroll) editor.revealLineInCenter(targets[0]);
    if (statusEl) {
      const plural = targets.length > 1 ? 's' : '';
      statusEl.textContent = targets.length + ' line' + plural + ' (L' + srcLine + ')';
    }
  };

  // PTX
  applyTo(D.ptx.lineMap, ptxDecoCollection, ptxEditor, ptxStatus, noPtxScroll, 'PTX');
  // SASS
  applyTo(D.sass.lineMap, sassDecoCollection, sassEditor, sassStatus, noSassScroll, 'SASS');
}
// Click source → highlight PTX + SASS
// Events without a resolved position, or arriving while the cross-highlight lock
// is held, are ignored.
srcEditor.onMouseDown((evt) => {
  const hit = evt.target.position;
  if (!hit || _crossLock) return;
  selectLine(hit.lineNumber);
});
// Make source line `n` the active selection: remember it, mark it in the source
// editor, highlight its PTX/SASS lines, rebuild the per-line metrics and show the
// 'line' tab.
function selectLine(n) {
  selectedLine = n;

  const lineRange = new monaco.Range(n, 1, n, 1);
  const selectionMark = {
    range: lineRange,
    options: { isWholeLine: true, className: 'line-selected' },
  };
  // Replaces the previous selection decoration, so only one line is marked.
  srcSelectCollection.set([selectionMark]);

  highlightFromSource(n);
  buildLineMetrics(n);
  switchTab('line');
}
// Click PTX → highlight source + SASS
// Looks up the source line for the clicked PTX line in _ptxToSrc (defined outside
// this section); PTX lines without a mapping are ignored.
ptxEditor.onMouseDown(e => {
  if (!e.target.position || _crossLock) return;
  const ptxLine = e.target.position.lineNumber;
  const srcLine = _ptxToSrc[ptxLine];
  if (!srcLine) return;
  _crossLock = true;
  try {
    selectedLine = srcLine;
    srcSelectCollection.set([{
      range: new monaco.Range(srcLine, 1, srcLine, 1),
      options: { isWholeLine: true, className: 'line-selected' }
    }]);
    srcEditor.revealLineInCenter(srcLine);
    // The user is already looking at the clicked PTX line, so do not scroll the PTX editor.
    highlightFromSource(srcLine, {noPtxScroll: true});
    buildLineMetrics(srcLine);
    switchTab('line');
  } finally {
    // Release the lock even if a step above throws; otherwise every editor
    // mouse-down handler that checks _crossLock would ignore all later clicks.
    _crossLock = false;
  }
});
// Click SASS → highlight source + PTX
// Looks up the source line for the clicked SASS line in _sassToSrc (defined outside
// this section); SASS lines without a mapping are ignored.
sassEditor.onMouseDown(e => {
  if (!e.target.position || _crossLock) return;
  const sassLine = e.target.position.lineNumber;
  const srcLine = _sassToSrc[sassLine];
  if (!srcLine) return;
  _crossLock = true;
  try {
    selectedLine = srcLine;
    srcSelectCollection.set([{
      range: new monaco.Range(srcLine, 1, srcLine, 1),
      options: { isWholeLine: true, className: 'line-selected' }
    }]);
    srcEditor.revealLineInCenter(srcLine);
    // The user is already looking at the clicked SASS line, so do not scroll the SASS editor.
    highlightFromSource(srcLine, {noSassScroll: true});
    buildLineMetrics(srcLine);
    switchTab('line');
  } finally {
    // Release the lock even if a step above throws; otherwise every editor
    // mouse-down handler that checks _crossLock would ignore all later clicks.
    _crossLock = false;
  }
});
// ── Split.js ──
// Which assembly views carry text decides how the panels are sized below.
const hasSass = Boolean(D.sass.text);
const hasPtx = Boolean(D.ptx.text);
const hasAsm = hasSass || hasPtx;

// Ask each editor (source, PTX, SASS — in that order) to re-measure its container.
function layoutAll() {
  for (const editor of [srcEditor, ptxEditor, sassEditor]) editor.layout();
}

// Horizontal split: source | assembly | metrics. With no assembly text the middle
// panel gets size 0 and a minimum size of 0.
Split(['#srcPanel', '#asmPanel', '#metPanel'], {
  sizes: hasAsm ? [35, 30, 35] : [55, 0, 45],
  minSize: [200, hasAsm ? 150 : 0, 250],
  gutterSize: 8,
  onDrag: layoutAll,
  onDragEnd: layoutAll,
});

// Vertical split inside asmPanel: PTX top, SASS bottom
if (hasAsm) {
  // Share the height evenly when both views exist; otherwise give it all to the
  // one that does.
  let asmSizes = [0, 100];
  if (hasPtx) asmSizes = hasSass ? [50, 50] : [100, 0];
  Split(['#ptxPane', '#sassPane'], {
    sizes: asmSizes,
    minSize: [0, 0],
    gutterSize: 6,
    direction: 'vertical',
    onDrag: layoutAll,
    onDragEnd: layoutAll,
  });
}

// Hide a pane outright when its view has no text (PTX checked first, then SASS).
for (const [isPresent, paneId] of [[hasPtx, 'ptxPane'], [hasSass, 'sassPane']]) {
  if (!isPresent) document.getElementById(paneId).style.display = 'none';
}

// One deferred layout pass after setup, then follow window resizes.
setTimeout(layoutAll, 50);
window.addEventListener('resize', layoutAll);
// ── Metric tabs ──
// Maps a tab name (the value switchTab compares against each tab's data-t) to the
// function that builds that tab's content. `line` and `region` are no-ops here:
// selectLine calls buildLineMetrics itself before switching to the 'line' tab;
// the region tab is presumably filled elsewhere as well — TODO confirm.
const tabBuilders = {
ov: buildOverview, line: ()=>{}, region: ()=>{}, exec: buildExecution,
mem: buildMemory, stalls: buildStalls, sys: buildSystem, trace: buildTrace
};
// Tab names whose builder has already run. switchTab sets builtTabs[name] after
// calling a builder and skips builders that are already marked here.
let builtTabs = {};
// Activate the metric tab called `name`: toggle the 'active' class on the tab
// buttons and content panes, scroll the metrics area back to the top, and run the
// tab's builder the first time the tab is shown.
function switchTab(name) {
  for (const tabEl of document.querySelectorAll('#tabBar .tab')) {
    tabEl.classList.toggle('active', tabEl.dataset.t === name);
  }
  for (const paneEl of document.querySelectorAll('.tc')) {
    paneEl.classList.toggle('active', paneEl.id === 'tc-' + name);
  }

  // Reset scroll position so new tab starts at top
  const scroller = document.querySelector('.mscroll');
  if (scroller) scroller.scrollTop = 0;

  // Nothing to build when the tab was built before or has no registered builder.
  if (builtTabs[name] || !tabBuilders[name]) return;
  tabBuilders[name]();
  builtTabs[name] = true;
}
// Delegated click handling for the tab bar: switch to whichever .tab element
// contains the click target; clicks outside any tab are ignored.
document.getElementById('tabBar').addEventListener('click', (evt) => {
  const clickedTab = evt.target.closest('.tab');
  if (!clickedTab) return;
  switchTab(clickedTab.dataset.t);
});
// ══════════════════════════════════════════════════════════════════
// TAB 1: OVERVIEW
// ══════════════════════════════════════════════════════════════════
function buildOverview() {
const ct = document.getElementById('ovCt');
const rids = Object.keys(D.regions).map(Number).sort();
let h = '';
// Summary cards
let totalInst=0, totalGmem=0, totalSmem=0, totalLmem=0;
for (const r of Object.values(D.regions)) {
totalInst += r.inst_total||0;
totalGmem += r.gmem_bytes||0;
totalSmem += r.smem_bytes||0;
totalLmem += r.lmem_bytes||0;
}
h += '<div class="cards cards-3">';
h += card(totalInst, 'Total Instructions', 'inst_total');
h += card(rids.length, 'Active Regions', null, {raw:true,noUnit:true});
const bottlenecks = rids.map(rid=>D.regions[rid]?.derived?.bottleneck).filter(Boolean);
const topBn = bottlenecks.length ? bottlenecks.sort((a,b)=>bottlenecks.filter(x=>x===b).length-bottlenecks.filter(x=>x===a).length)[0] : 'balanced';
h += '<div class="card"><div class="cv" style="font-size:14px">'+badgeFor(topBn)+'</div><div class="cl">Bottleneck'+infoIcon('bottleneck')+'</div></div>';
h += card(totalGmem+totalSmem+totalLmem, 'Total Memory', 'gmem_bytes');
if (D.trace) {
const total = D.trace.find(r=>r.name==='total');
h += card(total?total.count:D.trace.reduce((s,r)=>s+r.count,0), 'Trace Events', null, {noUnit:true});
} else {
h += '<div class="card"><div class="cv" style="color:var(--dim)">N/A</div><div class="cl">Trace Events</div></div>';
}
const nSources = (D.dataQuality.nvbit_modes.length>0?1:0) + (D.dataQuality.cupti_profiles.length>0?1:0) +
(D.dataQuality.has_trace?1:0) + (D.dataQuality.has_pcsamp?1:0) + (D.dataQuality.has_locality?1:0);
h += card(nSources, 'Data Sources', null, {raw:true,noUnit:true});
h += '</div>';
// Radar chart (region comparison)
if (rids.length > 0) {
h += sectionStart('ov_radar','Region Comparison');
h += '<div style="font-size:10px;color:var(--dim);margin-bottom:4px;line-height:1.5">'+
'Compares instruction mix across regions. Each axis is a <b>fraction of total instructions</b> in that region:<br>'+
'<b>FP32%</b> = ALU FP32 ops, <b>Memory%</b> = global+shared load/store ops, '+
'<b>Branch%</b> = branch instructions, <b>Divergence%</b> = 1 \u2212 branch uniformity, '+
'<b>Pred-off%</b> = predicated-off instruction fraction.</div>';
h += '<div id="ovRadar" class="chart" style="height:300px;"></div>';
h += sectionEnd();
}
// Data quality table
h += sectionStart('ov_dq','Data Quality');
h += '<table class="ptable" style="text-align:left"><tr><th style="text-align:left">Source</th><th style="text-align:left">Status</th><th>Records</th></tr>';
const dq = D.dataQuality;
const dqRow = (name,present,count) => '<tr><td style="text-align:left">'+name+'</td><td style="text-align:left">'+
(present?'<span class="badge badge-present">Present</span>':'<span class="badge badge-miss">Missing</span>')+
'</td><td>'+fmt(count)+'</td></tr>';
h += dqRow('NVBit Regions', Object.keys(D.regions).length>0, Object.keys(D.regions).length);
h += dqRow('SASS Profiles', dq.cupti_profiles.length>0, dq.cupti_profiles.length);
h += dqRow('PC Sampling', dq.has_pcsamp, dq.pcsamp_total);
h += dqRow('InstrExec', dq.has_instrexec, dq.instrexec_total);
h += dqRow('Trace', dq.has_trace, D.trace?D.trace.reduce((s,r)=>s+r.count,0):0);
h += dqRow('Locality', dq.has_locality, D.locality?Object.keys(D.locality.regions).length:0);
h += dqRow('Source Mapping', dq.has_source_mapping, Object.keys(D.perLine).length);
h += dqRow('CUPTI\u00D7NVBit Cross-ref', D.crossValidation && Object.keys(D.crossValidation).length>0,
D.crossValidation ? Object.keys(D.crossValidation).length : 0);
if (dq.cupti_profiles.length) h += '<tr><td style="text-align:left;color:var(--dim)" colspan=3>Profiles: '+dq.cupti_profiles.join(', ')+'</td></tr>';
if (dq.nvbit_modes.length) h += '<tr><td style="text-align:left;color:var(--dim)" colspan=3>NVBit modes: '+dq.nvbit_modes.join(', ')+'</td></tr>';
h += '</table>';
// Cross-validation summary: instruction mix fraction agreement
if (D.crossValidation && Object.keys(D.crossValidation).length > 0) {
h += '<div style="margin-top:6px;font-size:10px;font-weight:600;color:var(--bright)">NVBit \u00d7 CUPTI Instruction Mix Agreement'+infoIcon('nvbit_vs_cupti')+'</div>';
h += '<div style="font-size:9px;color:var(--dim);margin-bottom:4px">Compares compute/memory/branch instruction fractions between tools. Lower delta = better agreement.</div>';
h += '<table class="ptable" style="text-align:left;font-size:10px;margin-top:4px">';
h += '<tr><th style="text-align:left">Region</th><th style="text-align:right">Compute \u0394</th><th style="text-align:right">GlobMem \u0394</th><th style="text-align:right">SharedMem \u0394</th><th style="text-align:right">Branch \u0394</th><th style="text-align:right">Avg \u0394</th><th style="text-align:left">Status</th></tr>';
for (const [cvRid, cv] of Object.entries(D.crossValidation).sort((a,b)=>Number(a[0])-Number(b[0]))) {
const label = D.labels[cvRid] || 'region_'+cvRid;
const color = D.colors[cvRid] || '#656d76';
const avgD = cv.avg_delta||0;
const avgColor = avgD < 0.02 ? 'var(--green)' : avgD < 0.1 ? 'var(--orange)' : 'var(--red)';
h += '<tr>';
h += '<td style="text-align:left"><span style="color:'+color+'">\u25CF</span> '+label+'</td>';
for (const cat of ['compute','global_mem','shared_mem','branch']) {
const f = cv.fractions && cv.fractions[cat];
if (f) {
const dc = f.delta < 0.02 ? 'var(--green)' : f.delta < 0.1 ? 'var(--orange)' : 'var(--red)';
h += '<td style="text-align:right;color:'+dc+'">'+(f.delta*100).toFixed(1)+'pp</td>';
} else {
h += '<td style="text-align:right;color:var(--dim)">-</td>';
}
}
h += '<td style="text-align:right;color:'+avgColor+';font-weight:600">'+(avgD*100).toFixed(1)+'pp</td>';
h += '<td style="text-align:left;color:'+avgColor+';font-size:9px">'+(avgD<0.02?'Excellent':avgD<0.1?'Good':'Investigate')+'</td>';
h += '</tr>';
}
h += '</table>';
}
h += sectionEnd();
// Bottleneck hints
if (rids.length) {
h += sectionStart('ov_bn','Bottleneck Hints');
h += '<div style="font-size:9px;color:var(--dim);margin-bottom:6px;line-height:1.5">'+
'Classification: <b>compute_frac</b> = (alu_fp32 + tensor_wgmma) / inst_total, '+
'<b>memory_frac</b> = (ld/st_global + ld/st_shared) / inst_total, '+
'<b>branch_frac</b> = branch / inst_total. '+
'Largest fraction > 30% \u2192 that type; otherwise balanced.</div>';
for (const rid of rids) {
const r = D.regions[rid];
if (!r) continue;
const label = r.label || 'region_'+rid;
const bn = r.derived?.bottleneck || 'balanced';
const ic = r.inst_class || {};
const it = r.inst_total || 1;
const computeFrac = ((ic.alu_fp32||0)+(ic.tensor_wgmma||0))/it*100;
const memFrac = ((ic.ld_global||0)+(ic.st_global||0)+(ic.ld_shared||0)+(ic.st_shared||0))/it*100;
const branchFrac = (ic.branch||0)/it*100;
let hint = label + ': ' + fmt(r.inst_total||0) + ' instr';
hint += ' (compute=' + computeFrac.toFixed(1) + '%, memory=' + memFrac.toFixed(1) + '%, branch=' + branchFrac.toFixed(1) + '%)';
if ((r.gmem_bytes||0)>0) hint += ', ' + fmtBytes(r.gmem_bytes) + ' gmem';
if ((r.reg_spill_suspected||0)>0) hint += ', SPILL DETECTED';
h += '<div style="font-size:11px;margin-bottom:4px;display:flex;align-items:center;gap:6px">'+
'<span style="color:'+(D.colors[rid]||'var(--dim)')+'">\u25CF</span>'+
badgeFor(bn)+' <span style="color:var(--dim)">'+hint+'</span></div>';
}
h += sectionEnd();
}
// Tips
h += '<div class="msec"><h4>Tips</h4><div style="font-size:11px;color:var(--dim);line-height:1.8">';
h += 'Click source line \u2192 see metrics + SASS highlight<br>';
h += 'Click region \u2192 highlight all lines + full analysis<br>';
h += 'Drag panel borders \u2192 resize<br>';
h += 'SASS/PTX tabs \u2192 switch assembly view<br>';
h += 'Click <b style="font-style:italic;border:1px solid var(--dim);border-radius:50%;width:11px;height:11px;display:inline-flex;align-items:center;justify-content:center;font-size:8px">i</b> icons \u2192 metric descriptions';
h += '</div></div>';
ct.innerHTML = h;
// Radar chart — use dynamic axis max and filter empty regions
if (rids.length > 0) {
const el = document.getElementById('ovRadar');
if (el) {
const ch = mkChart('ovRadar');
const axes = ['FP32%','Memory%','Branch%','Divergence%','Pred-off%'];
const series = [];
const colors = ['#0969da','#1a7f37','#bc4c00','#8250df','#9a6700','#cf222e','#6639ba','#656d76'];
// Compute per-axis max for dynamic scaling
const axisMax = [0,0,0,0,0];
const rawData = [];
for (const rid of rids) {
const r = D.regions[rid];
if (!r || !r.inst_total) continue;
// Skip region 0 (_outside/prologue) if label starts with underscore
if ((r.label||'').startsWith('_')) continue;
const ic = r.inst_class||{};
const it = r.inst_total;
const vals = [
(ic.alu_fp32||0)/it,
((ic.ld_global||0)+(ic.st_global||0)+(ic.ld_shared||0)+(ic.st_shared||0))/it,
(ic.branch||0)/it,
r.derived?.branch_uniformity!=null ? 1-r.derived.branch_uniformity : 0,
r.derived?.predication_rate||0
];
vals.forEach((v,i)=>{if(v>axisMax[i])axisMax[i]=v;});
rawData.push({rid, r, vals});
}
// Dynamic max: round up to nearest nice value, minimum 0.05 so chart isn't invisible
const niceMax = axisMax.map(m => {
if (m <= 0.001) return 0.05;
if (m <= 0.05) return 0.1;