diff --git a/External/HIP/CMakeLists.txt b/External/HIP/CMakeLists.txt
index 826315e11883..74377f27a8ca 100644
--- a/External/HIP/CMakeLists.txt
+++ b/External/HIP/CMakeLists.txt
@@ -76,6 +76,7 @@ macro(create_local_hip_tests VariantSuffix)
   list(APPEND HIP_LOCAL_TESTS memset)
   list(APPEND HIP_LOCAL_TESTS split-kernel-args)
   list(APPEND HIP_LOCAL_TESTS builtin-logb-scalbn)
+  list(APPEND HIP_LOCAL_TESTS pgo-tiled-matmul)
 
   list(APPEND HIP_LOCAL_TESTS InOneWeekend)
   list(APPEND HIP_LOCAL_TESTS TheNextWeek)
@@ -122,6 +123,14 @@ macro(create_local_hip_tests VariantSuffix)
       message(WARNING "hipify-perl not found for ROCm installation in ${_RocmPath}.")
   endif()
 
+  # Add test for PGO tiled matmul.
+  configure_file(workload/pgo/test_pgo_matmul.sh.in ${CMAKE_CURRENT_BINARY_DIR}/test_pgo_matmul.sh @ONLY)
+  configure_file(workload/pgo/verify_pgo_matmul.sh.in ${CMAKE_CURRENT_BINARY_DIR}/verify_pgo_matmul.sh @ONLY)
+  llvm_test_run(EXECUTABLE "/bin/bash" "test_pgo_matmul.sh")
+  llvm_test_verify(/bin/bash verify_pgo_matmul.sh %o)
+  llvm_add_test(pgo-tiled-matmul-pipeline.test test_pgo_matmul.sh)
+  list(APPEND VARIANT_SIMPLE_TEST_TARGETS pgo-tiled-matmul-pipeline.test)
+
   # Add test for Blender.
   configure_file(workload/blender/test_blender.sh.in ${CMAKE_CURRENT_BINARY_DIR}/test_blender.sh @ONLY)
   configure_file(workload/blender/verify_blender.sh.in ${CMAKE_CURRENT_BINARY_DIR}/verify_blender.sh @ONLY)
diff --git a/External/HIP/pgo-tiled-matmul.hip b/External/HIP/pgo-tiled-matmul.hip
new file mode 100644
index 000000000000..610ce26a350b
--- /dev/null
+++ b/External/HIP/pgo-tiled-matmul.hip
@@ -0,0 +1,317 @@
+// pgo-tiled-matmul.hip - Tiled matmul kernel for offload PGO E2E testing
+//
+// This test demonstrates how Profile-Guided Optimization (PGO) improves
+// GPU kernel performance by reducing register spills on hot code paths.
+//
+// == What the kernel does ==
+//
+// The kernel performs a grouped batched matrix multiply (C = A × B) using
+// a tiled algorithm, similar to how high-performance GPU libraries like
+// AMD's Composable Kernel (CK) implement convolution and GEMM operations.
+//
+// Each GPU thread block has 32 threads (one wavefront) and handles a
+// 128×64 tile of the output matrix. Each thread computes a 16×16
+// sub-tile of the output, accumulating 256 partial sums across
+// K-dimension tile iterations. The threads cooperatively load input
+// tiles into shared memory (LDS).
+//
+// The single-wavefront configuration (32 threads) is chosen because it
+// achieves higher throughput than 256 threads due to better LDS
+// utilization and fewer wasted boundary threads. This also matches the
+// occupancy pattern of high-performance GPU kernels like CK's
+// grouped_conv_bwd_weight, which runs at 1 wave/SIMD with all VGPRs
+// utilized.
+//
+// The matrix dimensions (default 1000×1000) intentionally do not divide
+// evenly into tiles, creating two code paths:
+//   - Interior tiles (majority): fast path with unconditional memory loads
+//   - Boundary tiles (edges): slow path with per-element bounds checking
+//
+// == Why PGO helps ==
+//
+// The 256 float accumulators per thread (plus temporary registers for
+// loading from LDS) exceed the GPU's 256-VGPR register file, forcing the
+// compiler to "spill" some register values to slower scratch memory.
+//
+// Without PGO, the compiler has no information about which code path runs
+// more often, so it makes register allocation decisions without favoring
+// either path. This can result in hot-path values being spilled.
+//
+// With PGO (-fprofile-generate / -fprofile-use), the compiler learns from
+// runtime profile data that interior tiles execute ~2x more often than
+// boundary tiles. The register allocator then prioritizes keeping the
+// hot interior path's values in registers, spilling the cold boundary
+// path's values instead. Fewer spills on the hot path means fewer slow
+// scratch memory accesses during the majority of the kernel's execution.
+//
+// The kernel structure mirrors real workloads like CK's
+// grouped_conv_bwd_weight kernel.
+//
+// == Usage ==
+//
+//   ./pgo-tiled-matmul <M> <N> <K> <groups> <runs> <verify>
+//   ./pgo-tiled-matmul 1000 1000 500 4 5 1
+//
+// == Full PGO pipeline ==
+//
+//   # 1. Build instrumented binary
+//   clang++ -O3 --offload-arch=gfx1100 -x hip \
+//       -fprofile-generate=./profiles pgo-tiled-matmul.hip -o pgo_gen
+//
+//   # 2. Run to collect device profiles
+//   ./pgo_gen 1000 1000 500 4 1 0
+//
+//   # 3. Merge device profiles
+//   llvm-profdata merge -o device.profdata profiles/*.gfx1100.*.profraw
+//
+//   # 4. Build PGO-optimized binary
+//   clang++ -O3 --offload-arch=gfx1100 -x hip \
+//       -Xarch_device -fprofile-use=device.profdata \
+//       pgo-tiled-matmul.hip -o pgo_use
+//
+//   # 5. Compare: ./pgo_use should be faster than baseline
+
+#include <hip/hip_runtime.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+// Sub-tile dimensions are configurable via -D flags to tune register
+// pressure per GPU architecture:
+//   gfx1100 (256 VGPRs): default TH_M=16, TH_N=16 → 256 accumulators → spills
+//   gfx950  (512 VGPRs): -DTH_M=20 -DTH_N=16 → 320 accumulators → spills
+#ifndef TH_M
+#define TH_M 16
+#endif
+#ifndef TH_N
+#define TH_N 16
+#endif
+#ifndef TILE_M
+#define TILE_M (TH_M * 8)
+#endif
+#ifndef TILE_N
+#define TILE_N (TH_N * 4)
+#endif
+#define TILE_K 8
+#define THREADS_M (TILE_M / TH_M)
+#define THREADS_N (TILE_N / TH_N)
+#define BLOCK_SIZE (THREADS_M * THREADS_N)
+
+__global__ __launch_bounds__(BLOCK_SIZE)
+void tiled_matmul_kernel(
+    const float* __restrict__ A,
+    const float* __restrict__ B,
+    float* __restrict__ C,
+    int M, int N, int K
+) {
+    int group = blockIdx.z;
+    int tile_row = blockIdx.y;
+    int tile_col = blockIdx.x;
+
+    int tid = threadIdx.x;
+    int tr = tid / THREADS_N;
+    int tc = tid % THREADS_N;
+
+    int row_base = tile_row * TILE_M + tr * TH_M;
+    int col_base = tile_col * TILE_N + tc * TH_N;
+
+    __shared__ float sA[TILE_M][TILE_K];
+    __shared__ float sB[TILE_K][TILE_N];
+
+    float acc[TH_M][TH_N];
+    #pragma unroll
+    for (int i = 0; i < TH_M; i++)
+        #pragma unroll
+        for (int j = 0; j < TH_N; j++)
+            acc[i][j] = 0.0f;
+
+    size_t plane_a = (size_t)M * K;
+    size_t plane_b = (size_t)K * N;
+    size_t plane_c = (size_t)M * N;
+    const float* A_g = A + (size_t)group * plane_a;
+    const float* B_g = B + (size_t)group * plane_b;
+
+    bool tile_bnd_row = ((tile_row + 1) * TILE_M > M);
+    bool tile_bnd_col = ((tile_col + 1) * TILE_N > N);
+
+    for (int kt = 0; kt < K; kt += TILE_K) {
+        bool tile_bnd_k = (kt + TILE_K > K);
+
+        if (!tile_bnd_row && !tile_bnd_k) {
+            for (int i = tid; i < TILE_M * TILE_K; i += BLOCK_SIZE) {
+                int r = i / TILE_K;
+                int c = i % TILE_K;
+                sA[r][c] = A_g[(tile_row * TILE_M + r) * K + kt + c];
+            }
+        } else {
+            for (int i = tid; i < TILE_M * TILE_K; i += BLOCK_SIZE) {
+                int r = i / TILE_K;
+                int c = i % TILE_K;
+                int gr = tile_row * TILE_M + r;
+                int gc = kt + c;
+                sA[r][c] = (gr < M && gc < K) ? A_g[gr * K + gc] : 0.0f;
+            }
+        }
+
+        if (!tile_bnd_col && !tile_bnd_k) {
+            for (int i = tid; i < TILE_K * TILE_N; i += BLOCK_SIZE) {
+                int r = i / TILE_N;
+                int c = i % TILE_N;
+                sB[r][c] = B_g[(kt + r) * N + tile_col * TILE_N + c];
+            }
+        } else {
+            for (int i = tid; i < TILE_K * TILE_N; i += BLOCK_SIZE) {
+                int r = i / TILE_N;
+                int c = i % TILE_N;
+                int gr = kt + r;
+                int gc = tile_col * TILE_N + c;
+                sB[r][c] = (gr < K && gc < N) ? B_g[gr * N + gc] : 0.0f;
+            }
+        }
+
+        __syncthreads();
+
+        #pragma unroll
+        for (int kk = 0; kk < TILE_K; kk++) {
+            float a_reg[TH_M];
+            float b_reg[TH_N];
+
+            #pragma unroll
+            for (int rm = 0; rm < TH_M; rm++)
+                a_reg[rm] = sA[tr * TH_M + rm][kk];
+            #pragma unroll
+            for (int rn = 0; rn < TH_N; rn++)
+                b_reg[rn] = sB[kk][tc * TH_N + rn];
+
+            #pragma unroll
+            for (int rm = 0; rm < TH_M; rm++)
+                #pragma unroll
+                for (int rn = 0; rn < TH_N; rn++)
+                    acc[rm][rn] += a_reg[rm] * b_reg[rn];
+        }
+
+        __syncthreads();
+    }
+
+    float* C_g = C + (size_t)group * plane_c;
+    bool out_bnd_row = (row_base + TH_M > M);
+    bool out_bnd_col = (col_base + TH_N > N);
+
+    if (!out_bnd_row && !out_bnd_col) {
+        #pragma unroll
+        for (int rm = 0; rm < TH_M; rm++)
+            #pragma unroll
+            for (int rn = 0; rn < TH_N; rn++)
+                C_g[(row_base + rm) * N + col_base + rn] = acc[rm][rn];
+    } else {
+        for (int rm = 0; rm < TH_M; rm++)
+            for (int rn = 0; rn < TH_N; rn++)
+                if (row_base + rm < M && col_base + rn < N)
+                    C_g[(row_base + rm) * N + col_base + rn] = acc[rm][rn];
+    }
+}
+
+void matmul_ref(const float* A, const float* B, float* C,
+                int M, int N, int K, int groups) {
+    for (int g = 0; g < groups; g++) {
+        const float* Ag = A + (size_t)g * M * K;
+        const float* Bg = B + (size_t)g * K * N;
+        float* Cg = C + (size_t)g * M * N;
+        for (int i = 0; i < M; i++)
+            for (int j = 0; j < N; j++) {
+                double sum = 0.0;
+                for (int k = 0; k < K; k++)
+                    sum += (double)Ag[i * K + k] * (double)Bg[k * N + j];
+                Cg[i * N + j] = (float)sum;
+            }
+    }
+}
+
+int main(int argc, char** argv) {
+    int M = (argc > 1) ? atoi(argv[1]) : 1000;
+    int N = (argc > 2) ? atoi(argv[2]) : 1000;
+    int K = (argc > 3) ? atoi(argv[3]) : 500;
+    int groups = (argc > 4) ? atoi(argv[4]) : 4;
+    int runs = (argc > 5) ? atoi(argv[5]) : 5;
+    int verify = (argc > 6) ? atoi(argv[6]) : 1;
+
+    size_t elems_A = (size_t)groups * M * K;
+    size_t elems_B = (size_t)groups * K * N;
+    size_t elems_C = (size_t)groups * M * N;
+
+    float *h_A = (float*)malloc(elems_A * sizeof(float));
+    float *h_B = (float*)malloc(elems_B * sizeof(float));
+    float *h_C = (float*)malloc(elems_C * sizeof(float));
+    float *h_ref = verify ? (float*)malloc(elems_C * sizeof(float)) : nullptr;
+
+    srand(42);
+    for (size_t i = 0; i < elems_A; i++)
+        h_A[i] = (float)(rand() % 20 - 10) / 50.0f;
+    for (size_t i = 0; i < elems_B; i++)
+        h_B[i] = (float)(rand() % 20 - 10) / 50.0f;
+
+    float *d_A, *d_B, *d_C;
+    hipMalloc(&d_A, elems_A * sizeof(float));
+    hipMalloc(&d_B, elems_B * sizeof(float));
+    hipMalloc(&d_C, elems_C * sizeof(float));
+    hipMemcpy(d_A, h_A, elems_A * sizeof(float), hipMemcpyHostToDevice);
+    hipMemcpy(d_B, h_B, elems_B * sizeof(float), hipMemcpyHostToDevice);
+
+    int tiles_m = (M + TILE_M - 1) / TILE_M;
+    int tiles_n = (N + TILE_N - 1) / TILE_N;
+    dim3 grid(tiles_n, tiles_m, groups);
+    dim3 block(BLOCK_SIZE);
+
+    tiled_matmul_kernel<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
+    hipDeviceSynchronize();
+
+    if (verify) {
+        hipMemcpy(h_C, d_C, elems_C * sizeof(float), hipMemcpyDeviceToHost);
+        matmul_ref(h_A, h_B, h_ref, M, N, K, groups);
+        float max_err = 0.0f;
+        int errors = 0;
+        float tol = 5e-2f + K * 1e-3f;
+        for (size_t i = 0; i < elems_C; i++) {
+            float diff = fabsf(h_C[i] - h_ref[i]);
+            float denom = fmaxf(fabsf(h_ref[i]), 1e-6f);
+            float rel = diff / denom;
+            if (rel > max_err) max_err = rel;
+            if (rel > tol) errors++;
+        }
+        if (errors != 0) {
+            printf("%d errors (max_rel_err=%.6f tol=%.4f)\n", errors, max_err, tol);
+            free(h_A); free(h_B); free(h_C); free(h_ref);
+            hipFree(d_A); hipFree(d_B); hipFree(d_C);
+            return 1;
+        }
+    }
+
+    if (runs > 0) {
+        hipEvent_t start, stop;
+        hipEventCreate(&start);
+        hipEventCreate(&stop);
+
+        float total_ms = 0.0f;
+        for (int r = 0; r < runs; r++) {
+            hipEventRecord(start);
+            tiled_matmul_kernel<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
+            hipEventRecord(stop);
+            hipEventSynchronize(stop);
+            float ms;
+            hipEventElapsedTime(&ms, start, stop);
+            total_ms += ms;
+        }
+        float avg_ms = total_ms / runs;
+        double gflops = 2.0 * groups * M * N * K / (avg_ms * 1e-3) / 1e9;
+        printf("Average: %.3f ms, %.1f GFLOPS\n", avg_ms, gflops);
+
+        hipEventDestroy(start);
+        hipEventDestroy(stop);
+    }
+
+    printf("PASSED!\n");
+
+    hipFree(d_A); hipFree(d_B); hipFree(d_C);
+    free(h_A); free(h_B); free(h_C); free(h_ref);
+    return 0;
+}
diff --git a/External/HIP/pgo-tiled-matmul.reference_output b/External/HIP/pgo-tiled-matmul.reference_output
new file mode 100644
index 000000000000..391efdf648c3
--- /dev/null
+++ b/External/HIP/pgo-tiled-matmul.reference_output
@@ -0,0 +1,2 @@
+PASSED!
+exit 0
diff --git a/External/HIP/workload/pgo/test_pgo_matmul.sh.in b/External/HIP/workload/pgo/test_pgo_matmul.sh.in
new file mode 100644
index 000000000000..fb8e8cf6b6ba
--- /dev/null
+++ b/External/HIP/workload/pgo/test_pgo_matmul.sh.in
@@ -0,0 +1,118 @@
+#!/bin/bash
+# test_pgo_matmul.sh - Full offload PGO pipeline E2E test
+#
+# Tests the complete -fprofile-generate / -fprofile-use workflow for HIP:
+#   1. Build and run baseline (no PGO)
+#   2. Build instrumented binary, run to collect device profiles
+#   3. Merge profiles with llvm-profdata
+#   4. Build PGO-optimized binary, run and compare
+#
+# The test uses a tiled matmul kernel with 256 float accumulators per thread
+# that naturally exceeds the VGPR budget on AMDGPU, causing register spills.
+# PGO reduces spills on the hot (interior tile) path.
+
+set -e
+
+CLANG="@CMAKE_CXX_COMPILER@"
+CLANG_DIR=$(dirname "$CLANG")
+LLVM_PROFDATA="$CLANG_DIR/llvm-profdata"
+ROCM_PATH="@_RocmPath@"
+ARCH_FLAGS="@_ArchFlags@"
+SRC="@CMAKE_CURRENT_SOURCE_DIR@/pgo-tiled-matmul.hip"
+WORK_DIR="."
+
+# Problem size: non-tile-aligned to create boundary tiles
+M=1000 N=1000 K=500 GROUPS=4
+WARMUP=3
+BENCH_RUNS=5
+
+# Detect GPU architecture and set sub-tile size to induce register spills.
+# gfx950 has 512 VGPRs (vs 256 on gfx1100), so needs larger sub-tiles.
+GPU_ARCH=$("$CLANG_DIR/amdgpu-arch" 2>/dev/null | head -1 || echo "")
+TILE_FLAGS=""
+if echo "$GPU_ARCH" | grep -q "gfx950"; then
+    TILE_FLAGS="-DTH_M=20 -DTH_N=16"
+    log "gfx950 detected, using larger sub-tile (20x16=320 accumulators)"
+fi
+
+log() { echo "[pgo-test] $*"; }
+
+compile() {
+    local output=$1
+    local extra_flags=$2
+    "$CLANG" -O3 $ARCH_FLAGS -x hip \
+        --rocm-path="$ROCM_PATH" \
+        $TILE_FLAGS $extra_flags \
+        "$SRC" -o "$output" \
+        --hip-link -rtlib=compiler-rt -unwindlib=libgcc -frtlib-add-rpath \
+        2>&1 | grep -v "warning:" || true
+}
+
+run_kernel() {
+    local binary=$1
+    local runs=$2
+    local verify=${3:-0}
+    "$binary" $M $N $K $GROUPS $runs $verify
+}
+
+extract_avg_ms() {
+    grep "Average:" "$1" | awk '{print $2}'
+}
+
+# Step 1: Build baseline
+log "Step 1/6: Build baseline"
+compile "$WORK_DIR/baseline" ""
+
+# Step 2: Run baseline (warmup + benchmark)
+log "Step 2/6: Run baseline"
+run_kernel "$WORK_DIR/baseline" 1 1 > /dev/null  # verify correctness
+run_kernel "$WORK_DIR/baseline" $WARMUP 0 > /dev/null  # warmup
+run_kernel "$WORK_DIR/baseline" $BENCH_RUNS 0 > "$WORK_DIR/baseline_output.txt"
+BASELINE_AVG=$(extract_avg_ms "$WORK_DIR/baseline_output.txt")
+log "  Baseline average: $BASELINE_AVG ms"
+
+# Step 3: Build instrumented
+log "Step 3/6: Build instrumented"
+PROFILE_DIR="$WORK_DIR/pgo_profiles"
+rm -rf "$PROFILE_DIR"
+mkdir -p "$PROFILE_DIR"
+compile "$WORK_DIR/pgo_gen" "-fprofile-generate=$PROFILE_DIR"
+
+# Step 4: Collect profiles
+log "Step 4/6: Collect profiles"
+LLVM_PROFILE_FILE="$PROFILE_DIR/default_%m.profraw" \
+    run_kernel "$WORK_DIR/pgo_gen" 1 0 > /dev/null
+
+DEVICE_PROFS=$(find "$PROFILE_DIR" -name "*.gfx*.profraw" 2>/dev/null || true)
+if [ -z "$DEVICE_PROFS" ]; then
+    log "ERROR: No device profiles generated"
+    echo "PGO test FAILED: no device profiles"
+    exit 1
+fi
+
+# Step 5: Merge profiles and build PGO-optimized
+log "Step 5/6: Merge profiles and build PGO-optimized"
+"$LLVM_PROFDATA" merge -o "$WORK_DIR/device.profdata" $DEVICE_PROFS
+
+compile "$WORK_DIR/pgo_use" "-Xarch_device -fprofile-use=$WORK_DIR/device.profdata"
+
+# Step 6: Run PGO-optimized (warmup + benchmark)
+log "Step 6/6: Run PGO-optimized"
+run_kernel "$WORK_DIR/pgo_use" 1 1 > /dev/null  # verify correctness
+run_kernel "$WORK_DIR/pgo_use" $WARMUP 0 > /dev/null  # warmup
+run_kernel "$WORK_DIR/pgo_use" $BENCH_RUNS 0 > "$WORK_DIR/pgo_output.txt"
+PGO_AVG=$(extract_avg_ms "$WORK_DIR/pgo_output.txt")
+log "  PGO average: $PGO_AVG ms"
+
+# Report
+echo ""
+echo "=== PGO Tiled Matmul Results ==="
+echo "Baseline: $BASELINE_AVG ms"
+echo "PGO:      $PGO_AVG ms"
+
+if [ -n "$BASELINE_AVG" ] && [ -n "$PGO_AVG" ]; then
+    SPEEDUP=$(echo "scale=1; ($BASELINE_AVG - $PGO_AVG) / $BASELINE_AVG * 100" | bc 2>/dev/null || echo "N/A")
+    echo "Speedup:  ${SPEEDUP}% (positive = PGO faster)"
+fi
+
+echo "PGO test PASSED"
diff --git a/External/HIP/workload/pgo/verify_pgo_matmul.sh.in b/External/HIP/workload/pgo/verify_pgo_matmul.sh.in
new file mode 100644
index 000000000000..a00c75ba1962
--- /dev/null
+++ b/External/HIP/workload/pgo/verify_pgo_matmul.sh.in
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Verify the PGO matmul test output.
+# The test passes if the full PGO pipeline completed successfully
+# (correctness verified, all steps ran). Performance difference is
+# reported for information only.
+
+if grep -q "PGO test PASSED" "$1"; then
+    exit 0
+fi
+
+echo "PGO pipeline did not complete:"
+cat "$1"
+exit 1