diff --git a/External/HIP/CMakeLists.txt b/External/HIP/CMakeLists.txt index 826315e11883..74377f27a8ca 100644 --- a/External/HIP/CMakeLists.txt +++ b/External/HIP/CMakeLists.txt @@ -76,6 +76,7 @@ macro(create_local_hip_tests VariantSuffix) list(APPEND HIP_LOCAL_TESTS memset) list(APPEND HIP_LOCAL_TESTS split-kernel-args) list(APPEND HIP_LOCAL_TESTS builtin-logb-scalbn) + list(APPEND HIP_LOCAL_TESTS pgo-tiled-matmul) list(APPEND HIP_LOCAL_TESTS InOneWeekend) list(APPEND HIP_LOCAL_TESTS TheNextWeek) @@ -122,6 +123,14 @@ macro(create_local_hip_tests VariantSuffix) message(WARNING "hipify-perl not found for ROCm installation in ${_RocmPath}.") endif() + # Add test for PGO tiled matmul. + configure_file(workload/pgo/test_pgo_matmul.sh.in ${CMAKE_CURRENT_BINARY_DIR}/test_pgo_matmul.sh @ONLY) + configure_file(workload/pgo/verify_pgo_matmul.sh.in ${CMAKE_CURRENT_BINARY_DIR}/verify_pgo_matmul.sh @ONLY) + llvm_test_run(EXECUTABLE "/bin/bash" "test_pgo_matmul.sh") + llvm_test_verify(/bin/bash verify_pgo_matmul.sh %o) + llvm_add_test(pgo-tiled-matmul-pipeline.test test_pgo_matmul.sh) + list(APPEND VARIANT_SIMPLE_TEST_TARGETS pgo-tiled-matmul-pipeline.test) + # Add test for Blender. configure_file(workload/blender/test_blender.sh.in ${CMAKE_CURRENT_BINARY_DIR}/test_blender.sh @ONLY) configure_file(workload/blender/verify_blender.sh.in ${CMAKE_CURRENT_BINARY_DIR}/verify_blender.sh @ONLY) diff --git a/External/HIP/pgo-tiled-matmul.hip b/External/HIP/pgo-tiled-matmul.hip new file mode 100644 index 000000000000..610ce26a350b --- /dev/null +++ b/External/HIP/pgo-tiled-matmul.hip @@ -0,0 +1,317 @@ +// pgo-tiled-matmul.hip - Tiled matmul kernel for offload PGO E2E testing +// +// This test demonstrates how Profile-Guided Optimization (PGO) improves +// GPU kernel performance by reducing register spills on hot code paths. +// +// == What the kernel does == +// +// The kernel performs a grouped batched matrix multiply (C = A × B) using +// a tiled algorithm, similar to how high-performance GPU libraries like +// AMD's Composable Kernel (CK) implement convolution and GEMM operations. +// +// Each GPU thread block has 32 threads (one wavefront) and handles a +// 128×64 tile of the output matrix. Each thread computes a 16×16 +// sub-tile of the output, accumulating 256 partial sums across +// K-dimension tile iterations. The threads cooperatively load input +// tiles into shared memory (LDS). +// +// The single-wavefront configuration (32 threads) is chosen because it +// achieves higher throughput than 256 threads due to better LDS +// utilization and fewer wasted boundary threads. This also matches the +// occupancy pattern of high-performance GPU kernels like CK's +// grouped_conv_bwd_weight, which runs at 1 wave/SIMD with all VGPRs +// utilized. +// +// The matrix dimensions (default 1000×1000) intentionally do not divide +// evenly into tiles, creating two code paths: +// - Interior tiles (majority): fast path with unconditional memory loads +// - Boundary tiles (edges): slow path with per-element bounds checking +// +// == Why PGO helps == +// +// The 256 float accumulators per thread (plus temporary registers for +// loading from LDS) exceed the GPU's 256-VGPR register file, forcing the +// compiler to "spill" some register values to slower scratch memory. +// +// Without PGO, the compiler has no information about which code path runs +// more often, so it makes register allocation decisions without favoring +// either path. This can result in hot-path values being spilled. +// +// With PGO (-fprofile-generate / -fprofile-use), the compiler learns from +// runtime profile data that interior tiles execute ~2x more often than +// boundary tiles. The register allocator then prioritizes keeping the +// hot interior path's values in registers, spilling the cold boundary +// path's values instead. Fewer spills on the hot path means fewer slow +// scratch memory accesses during the majority of the kernel's execution. +// +// The kernel structure mirrors real workloads like CK's +// grouped_conv_bwd_weight kernel. +// +// == Usage == +// +// ./pgo-tiled-matmul +// ./pgo-tiled-matmul 1000 1000 500 4 5 1 +// +// == Full PGO pipeline == +// +// # 1. Build instrumented binary +// clang++ -O3 --offload-arch=gfx1100 -x hip \ +// -fprofile-generate=./profiles pgo-tiled-matmul.hip -o pgo_gen +// +// # 2. Run to collect device profiles +// ./pgo_gen 1000 1000 500 4 1 0 +// +// # 3. Merge device profiles +// llvm-profdata merge -o device.profdata profiles/*.gfx1100.*.profraw +// +// # 4. Build PGO-optimized binary +// clang++ -O3 --offload-arch=gfx1100 -x hip \ +// -Xarch_device -fprofile-use=device.profdata \ +// pgo-tiled-matmul.hip -o pgo_use +// +// # 5. Compare: ./pgo_use should be faster than baseline + +#include +#include +#include +#include + +// Sub-tile dimensions are configurable via -D flags to tune register +// pressure per GPU architecture: +// gfx1100 (256 VGPRs): default TH_M=16, TH_N=16 → 256 accumulators → spills +// gfx950 (512 VGPRs): -DTH_M=20 -DTH_N=16 → 320 accumulators → spills +#ifndef TH_M +#define TH_M 16 +#endif +#ifndef TH_N +#define TH_N 16 +#endif +#ifndef TILE_M +#define TILE_M (TH_M * 8) +#endif +#ifndef TILE_N +#define TILE_N (TH_N * 4) +#endif +#define TILE_K 8 +#define THREADS_M (TILE_M / TH_M) +#define THREADS_N (TILE_N / TH_N) +#define BLOCK_SIZE (THREADS_M * THREADS_N) + +__global__ __launch_bounds__(BLOCK_SIZE) +void tiled_matmul_kernel( + const float* __restrict__ A, + const float* __restrict__ B, + float* __restrict__ C, + int M, int N, int K +) { + int group = blockIdx.z; + int tile_row = blockIdx.y; + int tile_col = blockIdx.x; + + int tid = threadIdx.x; + int tr = tid / THREADS_N; + int tc = tid % THREADS_N; + + int row_base = tile_row * TILE_M + tr * TH_M; + int col_base = tile_col * TILE_N + tc * TH_N; + + __shared__ float sA[TILE_M][TILE_K]; + __shared__ float sB[TILE_K][TILE_N]; + + float acc[TH_M][TH_N]; + #pragma unroll + for (int i = 0; i < TH_M; i++) + #pragma unroll + for (int j = 0; j < TH_N; j++) + acc[i][j] = 0.0f; + + size_t plane_a = (size_t)M * K; + size_t plane_b = (size_t)K * N; + size_t plane_c = (size_t)M * N; + const float* A_g = A + (size_t)group * plane_a; + const float* B_g = B + (size_t)group * plane_b; + + bool tile_bnd_row = ((tile_row + 1) * TILE_M > M); + bool tile_bnd_col = ((tile_col + 1) * TILE_N > N); + + for (int kt = 0; kt < K; kt += TILE_K) { + bool tile_bnd_k = (kt + TILE_K > K); + + if (!tile_bnd_row && !tile_bnd_k) { + for (int i = tid; i < TILE_M * TILE_K; i += BLOCK_SIZE) { + int r = i / TILE_K; + int c = i % TILE_K; + sA[r][c] = A_g[(tile_row * TILE_M + r) * K + kt + c]; + } + } else { + for (int i = tid; i < TILE_M * TILE_K; i += BLOCK_SIZE) { + int r = i / TILE_K; + int c = i % TILE_K; + int gr = tile_row * TILE_M + r; + int gc = kt + c; + sA[r][c] = (gr < M && gc < K) ? A_g[gr * K + gc] : 0.0f; + } + } + + if (!tile_bnd_col && !tile_bnd_k) { + for (int i = tid; i < TILE_K * TILE_N; i += BLOCK_SIZE) { + int r = i / TILE_N; + int c = i % TILE_N; + sB[r][c] = B_g[(kt + r) * N + tile_col * TILE_N + c]; + } + } else { + for (int i = tid; i < TILE_K * TILE_N; i += BLOCK_SIZE) { + int r = i / TILE_N; + int c = i % TILE_N; + int gr = kt + r; + int gc = tile_col * TILE_N + c; + sB[r][c] = (gr < K && gc < N) ? B_g[gr * N + gc] : 0.0f; + } + } + + __syncthreads(); + + #pragma unroll + for (int kk = 0; kk < TILE_K; kk++) { + float a_reg[TH_M]; + float b_reg[TH_N]; + + #pragma unroll + for (int rm = 0; rm < TH_M; rm++) + a_reg[rm] = sA[tr * TH_M + rm][kk]; + #pragma unroll + for (int rn = 0; rn < TH_N; rn++) + b_reg[rn] = sB[kk][tc * TH_N + rn]; + + #pragma unroll + for (int rm = 0; rm < TH_M; rm++) + #pragma unroll + for (int rn = 0; rn < TH_N; rn++) + acc[rm][rn] += a_reg[rm] * b_reg[rn]; + } + + __syncthreads(); + } + + float* C_g = C + (size_t)group * plane_c; + bool out_bnd_row = (row_base + TH_M > M); + bool out_bnd_col = (col_base + TH_N > N); + + if (!out_bnd_row && !out_bnd_col) { + #pragma unroll + for (int rm = 0; rm < TH_M; rm++) + #pragma unroll + for (int rn = 0; rn < TH_N; rn++) + C_g[(row_base + rm) * N + col_base + rn] = acc[rm][rn]; + } else { + for (int rm = 0; rm < TH_M; rm++) + for (int rn = 0; rn < TH_N; rn++) + if (row_base + rm < M && col_base + rn < N) + C_g[(row_base + rm) * N + col_base + rn] = acc[rm][rn]; + } +} + +void matmul_ref(const float* A, const float* B, float* C, + int M, int N, int K, int groups) { + for (int g = 0; g < groups; g++) { + const float* Ag = A + (size_t)g * M * K; + const float* Bg = B + (size_t)g * K * N; + float* Cg = C + (size_t)g * M * N; + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) { + double sum = 0.0; + for (int k = 0; k < K; k++) + sum += (double)Ag[i * K + k] * (double)Bg[k * N + j]; + Cg[i * N + j] = (float)sum; + } + } +} + +int main(int argc, char** argv) { + int M = (argc > 1) ? atoi(argv[1]) : 1000; + int N = (argc > 2) ? atoi(argv[2]) : 1000; + int K = (argc > 3) ? atoi(argv[3]) : 500; + int groups = (argc > 4) ? atoi(argv[4]) : 4; + int runs = (argc > 5) ? atoi(argv[5]) : 5; + int verify = (argc > 6) ? atoi(argv[6]) : 1; + + size_t elems_A = (size_t)groups * M * K; + size_t elems_B = (size_t)groups * K * N; + size_t elems_C = (size_t)groups * M * N; + + float *h_A = (float*)malloc(elems_A * sizeof(float)); + float *h_B = (float*)malloc(elems_B * sizeof(float)); + float *h_C = (float*)malloc(elems_C * sizeof(float)); + float *h_ref = verify ? (float*)malloc(elems_C * sizeof(float)) : nullptr; + + srand(42); + for (size_t i = 0; i < elems_A; i++) + h_A[i] = (float)(rand() % 20 - 10) / 50.0f; + for (size_t i = 0; i < elems_B; i++) + h_B[i] = (float)(rand() % 20 - 10) / 50.0f; + + float *d_A, *d_B, *d_C; + hipMalloc(&d_A, elems_A * sizeof(float)); + hipMalloc(&d_B, elems_B * sizeof(float)); + hipMalloc(&d_C, elems_C * sizeof(float)); + hipMemcpy(d_A, h_A, elems_A * sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(d_B, h_B, elems_B * sizeof(float), hipMemcpyHostToDevice); + + int tiles_m = (M + TILE_M - 1) / TILE_M; + int tiles_n = (N + TILE_N - 1) / TILE_N; + dim3 grid(tiles_n, tiles_m, groups); + dim3 block(BLOCK_SIZE); + + tiled_matmul_kernel<<>>(d_A, d_B, d_C, M, N, K); + hipDeviceSynchronize(); + + if (verify) { + hipMemcpy(h_C, d_C, elems_C * sizeof(float), hipMemcpyDeviceToHost); + matmul_ref(h_A, h_B, h_ref, M, N, K, groups); + float max_err = 0.0f; + int errors = 0; + float tol = 5e-2f + K * 1e-3f; + for (size_t i = 0; i < elems_C; i++) { + float diff = fabsf(h_C[i] - h_ref[i]); + float denom = fmaxf(fabsf(h_ref[i]), 1e-6f); + float rel = diff / denom; + if (rel > max_err) max_err = rel; + if (rel > tol) errors++; + } + if (errors != 0) { + printf("%d errors (max_rel_err=%.6f tol=%.4f)\n", errors, max_err, tol); + free(h_A); free(h_B); free(h_C); free(h_ref); + hipFree(d_A); hipFree(d_B); hipFree(d_C); + return 1; + } + } + + if (runs > 0) { + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + + float total_ms = 0.0f; + for (int r = 0; r < runs; r++) { + hipEventRecord(start); + tiled_matmul_kernel<<>>(d_A, d_B, d_C, M, N, K); + hipEventRecord(stop); + hipEventSynchronize(stop); + float ms; + hipEventElapsedTime(&ms, start, stop); + total_ms += ms; + } + float avg_ms = total_ms / runs; + double gflops = 2.0 * groups * M * N * K / (avg_ms * 1e-3) / 1e9; + printf("Average: %.3f ms, %.1f GFLOPS\n", avg_ms, gflops); + + hipEventDestroy(start); + hipEventDestroy(stop); + } + + printf("PASSED!\n"); + + hipFree(d_A); hipFree(d_B); hipFree(d_C); + free(h_A); free(h_B); free(h_C); free(h_ref); + return 0; +} diff --git a/External/HIP/pgo-tiled-matmul.reference_output b/External/HIP/pgo-tiled-matmul.reference_output new file mode 100644 index 000000000000..391efdf648c3 --- /dev/null +++ b/External/HIP/pgo-tiled-matmul.reference_output @@ -0,0 +1,2 @@ +PASSED! +exit 0 diff --git a/External/HIP/workload/pgo/test_pgo_matmul.sh.in b/External/HIP/workload/pgo/test_pgo_matmul.sh.in new file mode 100644 index 000000000000..fb8e8cf6b6ba --- /dev/null +++ b/External/HIP/workload/pgo/test_pgo_matmul.sh.in @@ -0,0 +1,118 @@ +#!/bin/bash +# test_pgo_matmul.sh - Full offload PGO pipeline E2E test +# +# Tests the complete -fprofile-generate / -fprofile-use workflow for HIP: +# 1. Build and run baseline (no PGO) +# 2. Build instrumented binary, run to collect device profiles +# 3. Merge profiles with llvm-profdata +# 4. Build PGO-optimized binary, run and compare +# +# The test uses a tiled matmul kernel with 256 float accumulators per thread +# that naturally exceeds the VGPR budget on AMDGPU, causing register spills. +# PGO reduces spills on the hot (interior tile) path. + +set -e + +CLANG="@CMAKE_CXX_COMPILER@" +CLANG_DIR=$(dirname "$CLANG") +LLVM_PROFDATA="$CLANG_DIR/llvm-profdata" +ROCM_PATH="@_RocmPath@" +ARCH_FLAGS="@_ArchFlags@" +SRC="@CMAKE_CURRENT_SOURCE_DIR@/pgo-tiled-matmul.hip" +WORK_DIR="." + +# Problem size: non-tile-aligned to create boundary tiles +M=1000 N=1000 K=500 GROUPS=4 +WARMUP=3 +BENCH_RUNS=5 + +# Detect GPU architecture and set sub-tile size to induce register spills. +# gfx950 has 512 VGPRs (vs 256 on gfx1100), so needs larger sub-tiles. +GPU_ARCH=$("$CLANG_DIR/amdgpu-arch" 2>/dev/null | head -1 || echo "") +TILE_FLAGS="" +if echo "$GPU_ARCH" | grep -q "gfx950"; then + TILE_FLAGS="-DTH_M=20 -DTH_N=16" + log "gfx950 detected, using larger sub-tile (20x16=320 accumulators)" +fi + +log() { echo "[pgo-test] $*"; } + +compile() { + local output=$1 + local extra_flags=$2 + "$CLANG" -O3 $ARCH_FLAGS -x hip \ + --rocm-path="$ROCM_PATH" \ + $TILE_FLAGS $extra_flags \ + "$SRC" -o "$output" \ + --hip-link -rtlib=compiler-rt -unwindlib=libgcc -frtlib-add-rpath \ + 2>&1 | grep -v "warning:" || true +} + +run_kernel() { + local binary=$1 + local runs=$2 + local verify=${3:-0} + "$binary" $M $N $K $GROUPS $runs $verify +} + +extract_avg_ms() { + grep "Average:" "$1" | awk '{print $2}' +} + +# Step 1: Build baseline +log "Step 1/6: Build baseline" +compile "$WORK_DIR/baseline" "" + +# Step 2: Run baseline (warmup + benchmark) +log "Step 2/6: Run baseline" +run_kernel "$WORK_DIR/baseline" 1 1 > /dev/null # verify correctness +run_kernel "$WORK_DIR/baseline" $WARMUP 0 > /dev/null # warmup +run_kernel "$WORK_DIR/baseline" $BENCH_RUNS 0 > "$WORK_DIR/baseline_output.txt" +BASELINE_AVG=$(extract_avg_ms "$WORK_DIR/baseline_output.txt") +log " Baseline average: $BASELINE_AVG ms" + +# Step 3: Build instrumented +log "Step 3/6: Build instrumented" +PROFILE_DIR="$WORK_DIR/pgo_profiles" +rm -rf "$PROFILE_DIR" +mkdir -p "$PROFILE_DIR" +compile "$WORK_DIR/pgo_gen" "-fprofile-generate=$PROFILE_DIR" + +# Step 4: Collect profiles +log "Step 4/6: Collect profiles" +LLVM_PROFILE_FILE="$PROFILE_DIR/default_%m.profraw" \ + run_kernel "$WORK_DIR/pgo_gen" 1 0 > /dev/null + +DEVICE_PROFS=$(find "$PROFILE_DIR" -name "*.gfx*.profraw" 2>/dev/null || true) +if [ -z "$DEVICE_PROFS" ]; then + log "ERROR: No device profiles generated" + echo "PGO test FAILED: no device profiles" + exit 1 +fi + +# Step 5: Merge profiles and build PGO-optimized +log "Step 5/6: Merge profiles and build PGO-optimized" +"$LLVM_PROFDATA" merge -o "$WORK_DIR/device.profdata" $DEVICE_PROFS + +compile "$WORK_DIR/pgo_use" "-Xarch_device -fprofile-use=$WORK_DIR/device.profdata" + +# Step 6: Run PGO-optimized (warmup + benchmark) +log "Step 6/6: Run PGO-optimized" +run_kernel "$WORK_DIR/pgo_use" 1 1 > /dev/null # verify correctness +run_kernel "$WORK_DIR/pgo_use" $WARMUP 0 > /dev/null # warmup +run_kernel "$WORK_DIR/pgo_use" $BENCH_RUNS 0 > "$WORK_DIR/pgo_output.txt" +PGO_AVG=$(extract_avg_ms "$WORK_DIR/pgo_output.txt") +log " PGO average: $PGO_AVG ms" + +# Report +echo "" +echo "=== PGO Tiled Matmul Results ===" +echo "Baseline: $BASELINE_AVG ms" +echo "PGO: $PGO_AVG ms" + +if [ -n "$BASELINE_AVG" ] && [ -n "$PGO_AVG" ]; then + SPEEDUP=$(echo "scale=1; ($BASELINE_AVG - $PGO_AVG) / $BASELINE_AVG * 100" | bc 2>/dev/null || echo "N/A") + echo "Speedup: ${SPEEDUP}% (positive = PGO faster)" +fi + +echo "PGO test PASSED" diff --git a/External/HIP/workload/pgo/verify_pgo_matmul.sh.in b/External/HIP/workload/pgo/verify_pgo_matmul.sh.in new file mode 100644 index 000000000000..a00c75ba1962 --- /dev/null +++ b/External/HIP/workload/pgo/verify_pgo_matmul.sh.in @@ -0,0 +1,13 @@ +#!/bin/bash +# Verify the PGO matmul test output. +# The test passes if the full PGO pipeline completed successfully +# (correctness verified, all steps ran). Performance difference is +# reported for information only. + +if grep -q "PGO test PASSED" "$1"; then + exit 0 +fi + +echo "PGO pipeline did not complete:" +cat "$1" +exit 1