[bench] wvSplitK: per-kernel timing inside captured CUDA graph

mgehre-amd · mgehre-amd · commit c52ff9e9e831 · 2026-06-02T04:06:50.000-06:00
Replaces the whole-replay event timing with per-kernel timing recorded
inside the captured graph. PyTorch's torch.cuda.Event blocks the path
that records queryable timestamps inside graph capture on ROCm
(TORCH_CHECK(!external_) in c10/cuda/CUDAEvent.h, AIESW-34641), so the
bench drops to a small ctypes shim that calls
hipEventRecordWithFlags(hipEventRecordExternal) and hipEventElapsedTime
directly on the raw hipEvent_t handle pulled out of
torch.cuda.Event.cuda_event.

Capture layout uses a single event chain of iters_per_replay+1 events
(one per kernel boundary) rather than 2*iters_per_replay start/end
pairs, halving the event count in the graph for the same number of
per-kernel samples.

Reduces run-to-run noise from 0.88% to 0.51% median (6.27% -> 2.54% max)
across the full 76-cell sweep, and ~45 s wall per run. Numbers match
the in-model profile better than the previous whole-replay median.

The ctypes shim can be removed once PyTorch upstream lifts the ROCm
external-event guard.

Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
diff --git a/tests/kernels/quantization/bench_rocm_skinny_gemm.py b/tests/kernels/quantization/bench_rocm_skinny_gemm.py
@@ -7,6 +7,10 @@
 at batch sizes 1-4. Validates accuracy against torch.mm. Dynamically
 determines iteration count per shape based on IQR convergence.
 
+Per-kernel timestamps are recorded inside a captured CUDA graph via a small
+ctypes shim that calls hipEventRecordWithFlags(hipEventRecordExternal) —
+PyTorch's high-level torch.cuda.Event blocks this path on ROCm (AIESW-34641).
+
 Usage:
     python tests/kernels/quantization/bench_rocm_skinny_gemm.py
     python tests/kernels/quantization/bench_rocm_skinny_gemm.py --dtype bf16
@@ -15,7 +19,9 @@
 """
 
 import argparse
+import ctypes
 import math
+import os
 import time
 
 import torch
@@ -27,6 +33,68 @@
 # Use a conservative estimate to ensure we bust L3.
 CACHE_SIZE_BYTES = 64 * 1024 * 1024
 
+
+# ---------------------------------------------------------------------------
+# HIP ctypes shim — workaround for PyTorch's blanket disable of
+# cudaEventRecordExternal on ROCm (see AIESW-34641). Lets us record per-kernel
+# events inside a captured CUDA graph and read back queryable timestamps.
+# Remove once PyTorch upstream lifts the TORCH_CHECK in c10/cuda/CUDAEvent.h.
+# ---------------------------------------------------------------------------
+HIP_EVENT_RECORD_EXTERNAL = 0x01
+
+
+def _load_hip():
+    site = os.path.dirname(os.path.dirname(torch.__file__))
+    for sub in ("_rocm_sdk_core/lib", "_rocm_sdk_devel/lib"):
+        for name in ("libamdhip64.so.7", "libamdhip64.so"):
+            p = os.path.join(site, sub, name)
+            if os.path.exists(p):
+                lib = ctypes.CDLL(p)
+                lib.hipEventRecordWithFlags.argtypes = [
+                    ctypes.c_void_p,
+                    ctypes.c_void_p,
+                    ctypes.c_uint,
+                ]
+                lib.hipEventRecordWithFlags.restype = ctypes.c_int
+                lib.hipEventElapsedTime.argtypes = [
+                    ctypes.POINTER(ctypes.c_float),
+                    ctypes.c_void_p,
+                    ctypes.c_void_p,
+                ]
+                lib.hipEventElapsedTime.restype = ctypes.c_int
+                return lib
+    raise RuntimeError("libamdhip64 not found under torch site-packages")
+
+
+_HIP = _load_hip()
+
+
+def _record_external(ev: torch.cuda.Event, stream) -> None:
+    """Record `ev` on `stream` with hipEventRecordExternal (graph-safe)."""
+    err = _HIP.hipEventRecordWithFlags(
+        int(ev.cuda_event), int(stream.cuda_stream), HIP_EVENT_RECORD_EXTERNAL
+    )
+    if err != 0:
+        raise RuntimeError(f"hipEventRecordWithFlags returned {err}")
+
+
+def _elapsed_ms(start_ev: torch.cuda.Event, end_ev: torch.cuda.Event) -> float:
+    ms = ctypes.c_float(-1.0)
+    err = _HIP.hipEventElapsedTime(
+        ctypes.byref(ms), int(start_ev.cuda_event), int(end_ev.cuda_event)
+    )
+    if err != 0:
+        raise RuntimeError(f"hipEventElapsedTime returned {err}")
+    return ms.value
+
+
+def _make_event():
+    """Create a timing event and force lazy hipEventCreate by recording once."""
+    e = torch.cuda.Event(enable_timing=True)
+    e.record()
+    return e
+
+
 SHAPES = [
     # Qwen3-4B / Qwen3-VL-4B (identical backbone)
     (6144, 2560, "Qwen3-4B qkv"),
@@ -71,19 +139,20 @@ def _median_se(times_sorted):
 def bench_dynamic(
     fn,
     target_se_pct=0.2,
-    min_replays=8,
+    min_replays=4,
     max_replays=40,
     max_time_s=1.0,
     target_replay_ms=20.0,
 ):
-    """Benchmark fn by capturing many launches into a CUDA graph.
+    """Benchmark fn with per-kernel timing inside a captured CUDA graph.
 
     Probes the kernel time, sizes one capture so a replay runs ~target_replay_ms
     (so the GPU stays continuously busy and DVFS doesn't drop the clock between
-    launches), captures `iters_per_replay` calls of fn(0..iters-1), and times
-    repeated replays. fn(i) lets callers rotate weight buffers.
+    launches), captures `iters_per_replay` calls of fn(0..iters-1), each
+    bracketed by hipEventRecord(EXTERNAL) so per-kernel timestamps are queryable
+    on replay. fn(i) lets callers rotate weight buffers.
 
-    Returns (median_ms_per_kernel, num_kernels_timed, se_pct).
+    Returns (median_ms_per_kernel, num_samples, se_pct).
     """
     # 1) Probe one kernel to size the graph.
     fn(0)
@@ -97,46 +166,44 @@ def bench_dynamic(
     probe_ms = max(probe_start.elapsed_time(probe_end), 1e-3)
     iters_per_replay = max(2, min(2000, int(target_replay_ms / probe_ms)))
 
-    # 2) Warm + capture on a side stream. Run a few more launches inside the
-    #    capture stream before recording so the caching allocator is settled.
-    s = torch.cuda.Stream()
-    s.wait_stream(torch.cuda.current_stream())
-    with torch.cuda.stream(s):
-        for i in range(5):
-            fn(i)
-    torch.cuda.current_stream().wait_stream(s)
+    # 2) Allocate a chain of iters_per_replay+1 events. The i-th per-kernel
+    #    time is _elapsed_ms(events[i], events[i+1]). Force handle creation
+    #    on the default stream so the underlying hipEvent_t exists before
+    #    the stream-capture region.
+    events = [_make_event() for _ in range(iters_per_replay + 1)]
     torch.accelerator.synchronize()
 
+    # 3) Capture on a side stream, recording the event chain with EXTERNAL.
+    s = torch.cuda.Stream()
     g = torch.cuda.CUDAGraph()
     with torch.cuda.graph(g, stream=s):
+        _record_external(events[0], s)
         for i in range(iters_per_replay):
             fn(i)
+            _record_external(events[i + 1], s)
 
     # Warm one replay to absorb first-launch cost.
     g.replay()
     torch.accelerator.synchronize()
 
-    # 3) Time replays adaptively.
-    times = []
-    start_ev = torch.Event(enable_timing=True)
-    end_ev = torch.Event(enable_timing=True)
+    # 4) Time replays adaptively. Each replay yields iters_per_replay samples.
+    samples = []
     wall_start = time.monotonic()
     for r in range(max_replays):
-        start_ev.record()
         g.replay()
-        end_ev.record()
         torch.accelerator.synchronize()
-        times.append(start_ev.elapsed_time(end_ev) / iters_per_replay)
+        for i in range(iters_per_replay):
+            samples.append(_elapsed_ms(events[i], events[i + 1]))
 
-        if len(times) >= min_replays and len(times) % 5 == 0:
-            med, se_pct = _median_se(sorted(times))
+        if r + 1 >= min_replays:
+            med, se_pct = _median_se(sorted(samples))
             if se_pct < target_se_pct:
-                return med, len(times) * iters_per_replay, se_pct
+                return med, len(samples), se_pct
             if time.monotonic() - wall_start > max_time_s:
-                return med, len(times) * iters_per_replay, se_pct
+                return med, len(samples), se_pct
 
-    med, se_pct = _median_se(sorted(times))
-    return med, len(times) * iters_per_replay, se_pct
+    med, se_pct = _median_se(sorted(samples))
+    return med, len(samples), se_pct
 
 
 def parse_shape(s):
@@ -156,11 +223,8 @@ def run_bench(shapes, batch_sizes, dtype, target_se_pct):
     print(f"Shapes: {len(shapes)}, Batch sizes: {batch_sizes}")
     print()
 
-    print(
-        f"{'N':>2} {'M':>6}x{'K':<6} {'Label':<22} "
-        f"{'time_us':>9} {'BW GiB/s':>9} {'bufs':>5} {'iters':>6} {'SE%':>5}"
-    )
-    print("-" * 80)
+    print(f"{'N':>2} {'M':>6}x{'K':<6} {'Label':<22} {'med_us':>9} {'med_GiB/s':>10}")
+    print("-" * 60)
 
     t0 = time.time()
     for M, K, label in shapes:
@@ -184,17 +248,14 @@ def run_bench(shapes, batch_sizes, dtype, target_se_pct):
             fn = lambda i, ws=weights, a=activation: ops.wvSplitK(
                 ws[i % len(ws)], a, cu_count
             )
-            med_ms, iters, se_pct = bench_dynamic(
+            med_ms, _, _ = bench_dynamic(
                 fn,
                 target_se_pct=target_se_pct,
             )
-            time_us = med_ms * 1000
-            bw_gibs = weight_bytes / (med_ms * 1e-3) / (1 << 30)
+            med_us = med_ms * 1000
+            med_bw = weight_bytes / (med_ms * 1e-3) / (1 << 30)
 
-            print(
-                f"{N:>2} {M:>6}x{K:<6} {label:<22} "
-                f"{time_us:>8.1f} {bw_gibs:>8.1f} {n_bufs:>5} {iters:>6} {se_pct:>5.2f}"
-            )
+            print(f"{N:>2} {M:>6}x{K:<6} {label:<22} {med_us:>8.1f} {med_bw:>9.1f}")
 
     elapsed = time.time() - t0
     print()