diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 18b62b1..e06a6ab 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -25,6 +25,12 @@
       "source": "./skills/local-ai-use",
       "skills": "./",
       "description": "Route image generation, text-to-speech, and speech-to-text through a local AI Server to reduce token/cost usage."
+    },
+    {
+      "name": "rocm-doctor",
+      "source": "./skills/rocm-doctor",
+      "skills": "./",
+      "description": "Diagnose why ROCm, PyTorch, or llama.cpp isn't working on an AMD GPU. Matches the symptom against a fixed list of fifteen known misconfigurations and proposes the next step."
     }
   ]
 }
diff --git a/README.md b/README.md
index 3c06e79..c43de71 100644
--- a/README.md
+++ b/README.md
@@ -200,7 +200,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for step-by-step instructions, the full a
 
 ## Status
 
-This repository is in its early days. In-repo skills include `skills/local-ai-app-integration/` and `skills/local-ai-use/`, seeding the **Application integration** focus area, and `skills/apu-memory-tuner/`, seeding the **Hardware-native** focus area. The remaining skills are being built out incrementally alongside manifests and CI. Expect rapid iteration.
+This repository is in its early days. In-repo skills include `skills/local-ai-app-integration/` and `skills/local-ai-use/`, seeding the **Application integration** focus area, and `skills/apu-memory-tuner/` and `skills/rocm-doctor/`, seeding the **Hardware-native** focus area. The remaining skills are being built out incrementally alongside manifests and CI. Expect rapid iteration.
 
 ## License
 
diff --git a/skills/rocm-doctor/SKILL.md b/skills/rocm-doctor/SKILL.md
new file mode 100644
index 0000000..f27ac1f
--- /dev/null
+++ b/skills/rocm-doctor/SKILL.md
@@ -0,0 +1,285 @@
+---
+name: rocm-doctor
+description: >-
+  Diagnoses why ROCm, the HIP SDK, PyTorch, or llama.cpp is broken on an
+  AMD GPU on Linux or Windows, and either applies a low-risk fix with
+  consent or hands back the exact next step. Also routes Lemonade, LM
+  Studio, and Ollama issues to the right upstream channel. Use when the
+  user reports that ROCm or HIP isn't working, torch.cuda.is_available()
+  is False on Ryzen AI, rocminfo or hipInfo can't see the GPU,
+  or hits hipErrorNoBinaryForGpu,
+  HSA_STATUS_ERROR_INVALID_ISA, invalid device function, missing
+  amdhip64_6.dll, vcruntime140_1.dll, or libamdhip64.so, cannot open
+  /dev/kfd, ROCk module not loaded, an Adrenalin driver too old for the
+  HIP SDK, or a ROCm wheel that doesn't recognize gfx1151, gfx1150, or
+  gfx1103; or mentions HSA_OVERRIDE_GFX_VERSION,
+  HIP_VISIBLE_DEVICES, PYTORCH_ROCM_ARCH, render-group permissions,
+  amdgpu blacklist, Secure Boot, iGPU/dGPU collisions, or multi-GPU
+  hangs. Do not use for non-AMD GPUs, performance
+  tuning, or ROCm-on-WSL2.
+---
+
+# ROCm Doctor
+
+Given a "ROCm/PyTorch/llama.cpp isn't working on my AMD GPU" complaint,
+identify which **known misconfiguration** is the cause and either fix it
+or hand back the exact next step.
+
+This is a diagnose-and-fix skill, not a setup or tuning skill. The
+catalog of failure modes is a **closed list** that lives in
+`reference.md` and `scripts/diagnose.py`: if the user's symptom doesn't
+match one of them, the skill explicitly routes upstream rather than
+guessing. New failure modes get added by editing the catalog, not by
+the agent inventing them at runtime.
+
+## When to use this skill
+
+Use it when **any** of the following are true:
+
+- The user has an **AMD** GPU and a functional error with **PyTorch**,
+  **llama.cpp**, or anything else built directly against the system ROCm
+  (`/opt/rocm` or a pip wheel that bundles HIP). The skill examines the
+  host and diagnoses against the catalog.
+- The user is on **Lemonade**, **LM Studio**, or **Ollama**. These apps
+  ship their own ROCm and don't need a host-level examination, but the
+  user often doesn't know *where* to report the problem -- the skill
+  knows the right upstream channel for each (see
+  [Framework routing](#framework-routing)) and hands it over.
+
+Out of scope:
+
+- NVIDIA / Intel / Apple Silicon GPUs. Exit cleanly and tell the user.
+- Fresh installs on a clean machine. That's a setup task; point at
+  [`amdgpu-install`](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/install-overview.html)
+  (Linux) or the [HIP SDK installer](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
+  (Windows).
+- Pure performance complaints. Those belong in `mi-tuner` /
+  `omniperf-tune` / `apu-memory-tuner`.
+- **WSL2** (running Linux on top of Windows). The ROCm-on-WSL flow needs
+  Adrenalin Pro plus the WSL kernel update on the Windows host -- those
+  failure modes are not in this catalog. `examine.py` detects WSL via
+  `/proc/version` and exits 2 with a route-out message; if the user wants
+  WSL specifically, point them at <https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/install/installryz/wsl/howto_wsl.html>.
+
+## Prerequisites
+
+- **OS:** Linux **or** Windows (native). The catalog has 12 Linux entries
+  (5 of which are also valid on Windows) and 3 Windows-only entries; the
+  scripts pick the right subset for the host they run on.
+- **Linux tools the agent will invoke as part of examination** (best-effort;
+  the script degrades when one is missing):
+  - `lspci` (always present on desktop distros)
+  - `rocminfo` (when ROCm is installed)
+  - `journalctl` or `dmesg` (for amdgpu kernel-ring evidence)
+  - `python` / `python3` to introspect PyTorch
+  - `llama-cli` / `llama-server` / `main` to introspect llama.cpp
+- **Windows tools the agent will invoke as part of examination**:
+  - `powershell` (always present on Windows 10+) for `Get-CimInstance
+    Win32_VideoController` / `Win32_Processor` and the env-scope reads.
+  - `hipInfo.exe` from `%HIP_PATH%\bin` -- the Windows analog of `rocminfo`.
+    Absence is itself a signal (see `fix-13-hip-sdk-missing`).
+  - `setx` for env-var persistence and User-PATH edits (analog of editing
+    `~/.bashrc` on Linux).
+  - `python` to introspect PyTorch.
+- **Permissions:** examination is fully read-only and works as a regular
+  user on both OSes. Linux fixes that need `sudo` are flagged in their
+  recipe metadata; Windows fixes that touch the Machine env scope are
+  flagged similarly and `apply_fix.py` does NOT self-elevate -- the user
+  has to run an Administrator PowerShell when those are required.
+
+Silent footguns to surface explicitly when relevant:
+
+- `HSA_OVERRIDE_GFX_VERSION` -- forcing an unsupported gfx target works
+  for `rocminfo`/`hipInfo` but causes page faults at runtime. Diagnosis
+  `fix-2-unset-override` is the response when this is set on a GPU that
+  already has a native wheel; on Windows it can be persisted in either
+  the User or Machine env scope, so check both.
+- `HIP_VISIBLE_DEVICES` -- on dual-GPU systems (APU + dGPU) the iGPU is
+  often index 0 and destabilises HIP unless explicitly hidden.
+- `HIP_PATH` (Windows) -- if the user has multiple HIP SDK versions
+  installed under `C:\Program Files\AMD\ROCm\`, `HIP_PATH` decides which
+  one PyTorch / hipInfo actually loads. Pointing it at the wrong major
+  produces the same failure mode as `fix-8-wheel-rocm`.
+- `PYTORCH_ROCM_ARCH` -- only honored during a *build* of PyTorch. Setting
+  it at runtime does nothing for a prebuilt wheel.
+- `LD_LIBRARY_PATH` (Linux) -- a wheel-bundled `libamdhip64.so` shadowed
+  by a system one (or vice versa) gives confusing `cannot open shared
+  object file` errors that look like fix-8 but are really a load-order
+  bug. The Windows analog is `PATH` order: a stale HIP SDK bin directory
+  earlier on PATH than the one matching `HIP_PATH`.
+
+## The three-step flow
+
+Run these in order. The first two are read-only. The third asks before
+changing anything.
+
+```
+[ ] 1. Identify the framework, then examine (read-only).
+[ ] 2. Diagnose: match examination + symptom against the catalog.
+[ ] 3. Propose the fix; only apply with explicit consent; re-verify.
+```
+
+### Step 1: identify the framework and examine
+
+If the user hasn't said, ask which framework they are running. Use the
+`AskQuestion` tool with PyTorch / llama.cpp / Lemonade / LM Studio /
+Ollama / other as the options. The routing in [Framework routing](#framework-routing)
+keys off the answer.
+
+If the framework is in the "skip examination" bucket, jump straight to
+the upstream link and exit. Otherwise run:
+
+```bash
+python scripts/examine.py --framework pytorch --json > exam.json
+```
+
+Replace `pytorch` with `llama-cpp`, or pass `--framework auto` to let the
+script pick. Exit codes:
+
+| Exit | Meaning | Next action |
+|---|---|---|
+| 0 | Examined; AMD GPU present | Continue to Step 2. |
+| 2 | Wrong platform (WSL, neither Linux nor Windows, no AMD GPU) | Stop. Route the user. |
+| 3 | Probes partially failed | Continue but warn the user. |
+
+For a quick read-only summary without piping JSON, drop `--json`:
+
+```bash
+python scripts/examine.py --framework pytorch
+```
+
+`examine.py` collects exactly the facts the diagnosis catalog needs.
+On Linux: OS / kernel, AMD GPUs and gfx targets, `amdgpu` / `amdkfd`
+status, `/dev/kfd` ownership and group, user's group membership, system
+ROCm version and install method, framework version and arch list, the
+silent-footgun env vars, container/IOMMU state, and recent `amdgpu`
+kernel log lines. On Windows: AMD adapters and gfx targets via
+`Win32_VideoController` + `hipInfo.exe`, the HIP SDK install path and
+version, the Adrenalin / kernel-mode driver version, MSVC redistributable
+presence, and the same env-var snapshot. It deliberately does NOT spawn
+heavy probes (no kernel launches, no model downloads).
+
+### Step 2: diagnose
+
+Hand the JSON snapshot plus the user's error text to `diagnose.py`:
+
+```bash
+python scripts/diagnose.py --exam exam.json \
+  --symptom "HIP error: invalid device function on gfx1151"
+```
+
+The script runs every checker in the catalog, scores each from 0..100,
+and prints a ranked list. Each match has a stable `fix-N-...` id used by
+`apply_fix.py`.
+
+Score tiers:
+
+- `>= 75` (`HIGH`) -- propose the fix and (if auto-applicable) ask for
+  consent to apply it.
+- `>= 50` (`LIKELY`) -- describe the match and ask the user to confirm one
+  more piece of evidence before applying.
+- Below `50` -- print but do **not** act. If nothing scores `>= 50`, the
+  script exits 1 with a single-line route to the right upstream tracker.
+  Do not speculate.
+
+JSON output (`--json`) is the same data the agent should use programmatically:
+
+```bash
+python scripts/diagnose.py --exam exam.json --symptom "..." --json
+```
+
+### Step 3: apply the fix (with consent)
+
+Show the user the proposed fix (it's already printed by `diagnose.py`).
+If they consent, run:
+
+```bash
+python scripts/apply_fix.py --fix-id fix-4-render-group --dry-run
+python scripts/apply_fix.py --fix-id fix-4-render-group --yes
+```
+
+`--dry-run` prints the exact commands without executing. `--yes` skips
+the interactive `[y/N]` prompt (only pass this after the user has agreed
+in chat).
+
+A subset of fixes are auto-applicable; the rest are deliberately
+print-only because the risk of a half-applied state is too high for an
+agent to take. To see which is which without consulting `reference.md`:
+
+```bash
+python scripts/apply_fix.py --list
+```
+
+That prints every `fix-id` with an `AUTO` or `PRINT-ONLY` tag. Auto
+fixes are bounded operations like unsetting an env var, adding the user
+to a group, or appending a single line to a shell rc. Print-only fixes
+involve reinstalling frameworks, editing GRUB, regenerating the
+initramfs, or moving system repo files; those need a human at the
+keyboard.
+
+After every fix, re-run the `verify` command the recipe printed. Only
+declare success when the user's *original* failing command now succeeds
+(e.g. `torch.cuda.is_available()` returns `True`, `rocminfo` lists the
+GPU, the llama.cpp build runs).
+
+## Framework routing
+
+The skill's first decision is which framework the user runs. Some
+frameworks ship their own ROCm and bypass the system install; for those
+the right answer is "you're in the wrong place, here's where to file
+it", and the skill delivers that answer directly rather than running
+useless probes against the host.
+
+| Framework | Examine the host? | Action |
+|---|---|---|
+| PyTorch (Linux ROCm wheel) | Yes | `python scripts/examine.py --framework pytorch`, then `diagnose.py`. |
+| PyTorch (Windows TheRock wheel) | Yes | Same scripts; on Windows `diagnose.py` filters the catalog to the cross-platform and Windows-only entries. |
+| llama.cpp (built against system ROCm/HIP SDK) | Yes | `python scripts/examine.py --framework llama-cpp`, then `diagnose.py`. |
+| Lemonade | No -- ships its own ROCm | Route to <https://github.com/lemonade-sdk/lemonade/issues> and the Lemonade [Discord](https://discord.gg/5xXzkMu8Zk). |
+| LM Studio | No -- ships its own runtime | Route to <https://lmstudio.ai/docs/app> (in-app support; no public repo). |
+| Ollama | No -- ships its own runtime | Route to <https://github.com/ollama/ollama/issues> and the Ollama Discord. |
+| vLLM / SGLang | Out of scope until phase 1+ | Route to the project's own issue tracker. |
+
+If a Lemonade / LM Studio / Ollama user *does* have a host-level ROCm
+problem (rare), it shows up when their app fails AND a standalone
+`rocminfo` (Linux) / `hipInfo.exe` (Windows) also fails. Only then
+escalate to the full examination.
+
+## Safety rules
+
+- Read-only by default. Examination and diagnosis never change state.
+- Always print before applying. `apply_fix.py` shows every command before
+  executing it, even when `--yes` skips the consent prompt.
+- Never reboot, never touch BIOS, never flash firmware.
+- Never reinstall system packages without an interactive prompt or `--yes`.
+- Never set `HSA_OVERRIDE_GFX_VERSION` as the *first* fix when a native
+  wheel exists. That is `fix-2-unset-override`'s entire reason for being.
+- Never silently fall back to a different fix when the requested one
+  isn't applicable. Exit 3 and tell the user why.
+- When nothing in the catalog matches, **do not speculate**. Hand the
+  user the upstream tracker URL from `diagnose.py --json`.
+
+## Verification checklist
+
+Mark this skill complete only when **all** are true:
+
+- [ ] `python scripts/examine.py` exits 0 (or 3 with the user's explicit
+      go-ahead to continue despite a partial probe).
+- [ ] `python scripts/diagnose.py --exam exam.json --symptom "..."` exits 0
+      and surfaces exactly one HIGH-confidence diagnosis, OR it exits 1
+      and the user has been routed to the right upstream tracker.
+- [ ] If a fix was applied: the recipe's `verify` command exits cleanly.
+- [ ] The user's *original* failing command now succeeds end-to-end (run
+      it again in their original shell).
+- [ ] If any fix needed a re-login or reboot, the user has actually done
+      it before declaring success.
+
+If any box is unchecked, the failure isn't resolved -- say so out loud
+rather than declaring victory.
+
+## Reference
+
+For the full catalog of known misconfigurations, every fix-id and its
+verify command, the silent-footgun env-var reference, and the
+upstream-routing table in machine-readable form, see
+[reference.md](reference.md).
diff --git a/skills/rocm-doctor/reference.md b/skills/rocm-doctor/reference.md
new file mode 100644
index 0000000..07d4120
--- /dev/null
+++ b/skills/rocm-doctor/reference.md
@@ -0,0 +1,363 @@
+# ROCm Doctor -- Reference
+
+Detailed background for the `rocm-doctor` skill. Read this only when the
+three-step flow in `SKILL.md` doesn't cover a decision.
+
+## Contents
+
+- [The known-misconfigurations catalog](#the-known-misconfigurations-catalog)
+- [Silent-footgun environment variables](#silent-footgun-environment-variables)
+- [Windows-specific footguns](#windows-specific-footguns)
+- [Framework support matrix](#framework-support-matrix)
+- [Device support, phased](#device-support-phased)
+- [Live AMD compatibility matrices](#live-amd-compatibility-matrices)
+- [Wheel index reference](#wheel-index-reference)
+- [Upstream routing](#upstream-routing)
+- [Why we do not auto-set HSA_OVERRIDE_GFX_VERSION](#why-we-do-not-auto-set-hsa_override_gfx_version)
+- [Why WSL is out of scope](#why-wsl-is-out-of-scope)
+- [Adjacent problem: matrices in hand-typed tables](#adjacent-problem-matrices-in-hand-typed-tables)
+
+---
+
+## The known-misconfigurations catalog
+
+This is the closed list `diagnose.py` checks against. Each row maps to one
+`fix-N-...` recipe in `apply_fix.py`. **If a user's symptom doesn't
+match any of these, the skill must not speculate** -- it exits 1 and
+prints the upstream tracker URL from `_route_when_no_match`.
+
+This catalog grows over time. To add a new failure mode: add a
+`check_N_*` function to `scripts/diagnose.py`, a `FixRecipe` with the
+matching `fix-id` to `scripts/apply_fix.py`'s `RECIPES`, and a row to
+the table below. The decision-tree contract -- score 0..100, emit the
+recipe's `verify` command on a hit, exit 1 + route upstream on a miss --
+stays the same regardless of catalog size.
+
+| # | fix-id | OS | Failure pattern | Typical signal | Default fix |
+|---|---|---|---|---|---|
+| 1 | `fix-1-arch` | both | GPU `gfx` target not in framework's compiled arch list | `hipErrorNoBinaryForGpu`, `HIP error: invalid device function`, `HSA_STATUS_ERROR_INVALID_ISA`, `torch.cuda.get_arch_list()` missing the GPU's gfx | Reinstall the framework from a wheel index that ships kernels for the GPU's gfx (TheRock per-gfx wheels are the recommended fallback, and the only first-party option on Windows AMD). |
+| 2 | `fix-2-unset-override` | both | `HSA_OVERRIDE_GFX_VERSION` set on a GPU that has a native wheel | Hangs, `amdgpu: page fault` in `dmesg`, `OUT_OF_REGISTERS` from the compiler | Linux: `unset HSA_OVERRIDE_GFX_VERSION` and remove from shell rc. Windows: `setx HSA_OVERRIDE_GFX_VERSION ""`, plus check the Machine env scope. |
+| 3 | `fix-3-rocm-kernel` | linux | ROCm <-> distro/kernel forms an unsupported triple | `amdgpu-install` DKMS build fails; `amdgpu` not loaded after install | Cross-check the live AMD compatibility matrix; install matching HWE kernel; consider `--no-dkms`. |
+| 4 | `fix-4-render-group` | linux | User not in `render` / `video` groups, or `/dev/kfd` group is wrong | `Unable to open /dev/kfd: Operation not permitted`; `rocminfo` works under `sudo` but not as user | `sudo usermod -a -G render,video "$USER"`; log out/in. |
+| 5 | `fix-5-amdgpu-load` | linux | `amdgpu` kernel module not loaded or blacklisted | `rocminfo` says "ROCk module is NOT loaded"; `lsmod \| grep amdgpu` empty; blacklist line in `/etc/modprobe.d/*` | Remove blacklist; `update-initramfs -u`; `modprobe amdgpu`; check Secure Boot. |
+| 6 | `fix-6-path` | both | ROCm/HIP binaries not on `PATH` after install | `rocminfo: command not found` (Linux) or `hipInfo.exe` not in `%PATH%` (Windows) immediately after a clean install | Linux: append `/opt/rocm/bin` to `PATH` in the shell rc. Windows: `setx PATH "%PATH%;C:\Program Files\AMD\ROCm\<ver>\bin"` and reopen the shell. |
+| 7 | `fix-7-stale-repos` | linux | Stale / conflicting APT or DNF repos from prior installer runs | `404` on `repo.radeon.com`, "Release file not valid", mixed-version packages | Quarantine duplicate repo files in `/etc/apt/sources.list.d/`; re-run `apt update` cleanly. |
+| 8 | `fix-8-wheel-rocm` | both | Framework wheel built for a different ROCm/HIP major than the system | Linux: `libamdhip64.so.X: cannot open shared object file`. Windows: `amdhip64_X.dll could not be found` / `DLL load failed`. | Reinstall the framework from the index matching the system ROCm/HIP SDK major (or upgrade the system to match). |
+| 9 | `fix-9-igpu-dgpu` | both | iGPU enumerated alongside dGPU and destabilising the runtime | Random crashes / segfaults on systems with both an APU and a dGPU | Linux: `export HIP_VISIBLE_DEVICES=<dGPU-index>`. Windows: `setx HIP_VISIBLE_DEVICES <dGPU-index>` and reopen the shell. |
+| 10 | `fix-10-container` | linux | Container can't see `/dev/kfd` or `/dev/dri/renderD*` | `rocminfo` inside container fails with permission denied; host works | Re-launch with `--device=/dev/kfd --device=/dev/dri --group-add render`; on rootless podman also `--userns=keep-id`. |
+| 11 | `fix-11-iommu` | linux | Multi-GPU hang when IOMMU is in default 'on' mode | First multi-GPU job hangs indefinitely | Add `iommu=pt` to the kernel cmdline; reboot. |
+| 12 | `fix-12-installer` | linux | `amdgpu-install` left a half-configured state | Subsequent `apt update` errors; `dpkg` complains about half-configured packages; `--accept-eula` repo regression | Run the documented uninstall sequence, then reinstall without the offending flag. |
+| 13 | `fix-13-hip-sdk-missing` | windows | Framework links HIP but the HIP SDK isn't installed on this host | `amdhip64_X.dll not found`, `Could not find HIP`, `hipInfo` is not a command, `HIP_PATH` unset | Install the AMD HIP SDK matched to the framework's HIP major: <https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html> |
+| 14 | `fix-14-adrenalin-too-old` | windows | Adrenalin / kernel-mode driver older than the HIP SDK pairs with | HIP SDK installed but `hipInfo.exe` reports no agents; `driver too old` style errors | Update Adrenalin from <https://www.amd.com/en/support>; cross-check the SDK release notes for the exact pairing; reboot. |
+| 15 | `fix-15-msvc-redist` | windows | MSVC 2015-2022 runtime DLL missing -- HIP DLLs cannot load | `vcruntime140.dll` / `vcruntime140_1.dll` missing dialog; `api-ms-win-crt-*.dll` errors | Install the VC++ redistributable: <https://aka.ms/vs/17/release/vc_redist.x64.exe>. |
+
+For the exact heuristics each checker uses (state signals vs. symptom
+keyword weights), see the per-function comments in `scripts/diagnose.py`.
+
+## Silent-footgun environment variables
+
+These four change ROCm/HIP behaviour without printing a warning. Each one
+gets a named callout in this section because they account for a
+disproportionate share of "ROCm doesn't work" reports.
+
+### `HSA_OVERRIDE_GFX_VERSION`
+
+Tells HSA to advertise a different `gfx` target to user-space than the
+kernel actually has. Useful in exactly one situation: when no
+framework wheel ships kernels for your real gfx and a close-enough gfx
+exists. Outside that case it causes page faults at runtime because the
+compiler emits ISA for the override target but the hardware executes a
+different ISA.
+
+The doctor's default response when this variable is set on a GPU that
+*does* have a native wheel is `fix-2-unset-override`, which:
+
+1. Tells the user the variable is set.
+2. Suggests `unset HSA_OVERRIDE_GFX_VERSION`.
+3. Greps the user's shell rc files for persistent exports and points
+   them at the lines to delete.
+
+It deliberately does not edit the user's dotfiles. Editing someone
+else's `~/.bashrc` is too easy to get wrong and too easy to forget you
+did.
+
+### `HIP_VISIBLE_DEVICES` / `ROCR_VISIBLE_DEVICES`
+
+The HIP / HSA equivalents of `CUDA_VISIBLE_DEVICES`. They restrict which
+agents the runtime enumerates, by integer index in `rocminfo` order.
+Setting either to `0,1` does not change anything on a single-GPU box but
+matters on dual-GPU boxes (APU + dGPU, or two dGPUs).
+
+The doctor uses `HIP_VISIBLE_DEVICES` (not `ROCR_VISIBLE_DEVICES`)
+because both ROCm and PyTorch honour it; PyTorch also honours
+`CUDA_VISIBLE_DEVICES` as an alias on HIP builds, which surprises
+users who set both to different values. If both are set, the agent
+should ask the user to pick one and unset the other.
+
+### `PYTORCH_ROCM_ARCH`
+
+A **build-time** variable, not a runtime one. Used when compiling
+PyTorch from source to select which `gfx` targets the wheel will ship
+kernels for. Setting it at runtime against a prebuilt wheel does
+nothing; the wheel's arch list was baked at build time.
+
+The agent should treat `PYTORCH_ROCM_ARCH` in a user's runtime shell as
+a tell that the user has been pasting recipes from the wrong tutorial.
+It is not a fix; it is misinformation.
+
+### `LD_LIBRARY_PATH`
+
+Frameworks that bundle their own HIP (most pip wheels) ship a private
+`libamdhip64.so.X`. If the user has `LD_LIBRARY_PATH` pointing at a
+system `/opt/rocm/lib` that contains a different major version, the
+loader may pick the wrong one and the import fails with `cannot open
+shared object file` or `version 'X' not found`. This LOOKS like
+`fix-8-wheel-rocm` (wheel/ROCm major mismatch) but the underlying cause
+is a load-order conflict.
+
+If `examine.py` reports `hip_libs_on_ld_path=true` and the framework
+also bundles HIP, suggest unsetting `LD_LIBRARY_PATH` and re-running the
+import before reinstalling anything.
+
+## Windows-specific footguns
+
+Windows uses different mechanisms for the same failure modes Linux has;
+keep the analogies straight rather than transplanting Linux fixes.
+
+### `HIP_PATH` and multiple HIP SDK installs
+
+The HIP SDK installer drops files under
+`C:\Program Files\AMD\ROCm\<version>\` and sets `HIP_PATH` (and a
+versioned `HIP_PATH_<ver>`) in the user/machine env. Multiple SDKs can
+coexist on disk; whichever `HIP_PATH` points at is the one PyTorch and
+`hipInfo.exe` actually load. Pointing it at the wrong major has the same
+end result as `fix-8-wheel-rocm` -- `amdhip64_X.dll` from the SDK's `bin`
+directory has the wrong major number for the installed framework.
+
+`examine.py` records the `HIP_PATH` env var alongside the discovered SDK
+install path. When they disagree (`HIP_PATH` is set but `hip_sdk_path`
+points at a different directory), surface both values to the user and let
+them decide which one is right before any other fix.
+
+### PATH ordering on Windows
+
+Windows resolves DLLs via PATH; there is no separate `LD_LIBRARY_PATH`-style variable.
+If the user has more than one `...\AMD\ROCm\<ver>\bin` on PATH, the first
+one wins for DLL resolution, which can be a different SDK than `HIP_PATH`
+points at. The signal is the same as Linux's load-order conflict: a
+`cannot find amdhip64_X.dll` error that doesn't go away after reinstalling
+the right SDK.
+
+### Adrenalin pairing
+
+The user-space HIP SDK and the kernel-mode driver (Adrenalin / Adrenalin
+Pro) have to match. AMD bumps the supported pairing every HIP SDK
+release; the live table is in
+<https://rocm.docs.amd.com/projects/install-on-windows/en/latest/install/install.html>.
+We deliberately do NOT hardcode a minimum Adrenalin version in
+`diagnose.py` -- the table goes stale within months. `fix-14-adrenalin-too-old`
+triggers on observable failure (HIP SDK present + `hipInfo.exe` cannot
+enumerate, or matching keyword in the user's symptom) and routes the user
+to the live page.
+
+### MSVC redistributable
+
+The HIP SDK's `amdhip64_*.dll` links against the MSVC 2015-2022 runtime
+(`vcruntime140.dll`, `vcruntime140_1.dll`). Without the redistributable,
+`import torch` fails with a missing-DLL dialog that points at
+`vcruntime140_1.dll`, not at the HIP runtime. `fix-15-msvc-redist` is
+specifically the path for this -- do NOT route it to `fix-8-wheel-rocm`
+even though the surface error involves a missing DLL.
+
+### `setx` does not affect open shells
+
+Both `apply_fix.py`'s Windows runners and the recipe `commands` use
+`setx` to persist env vars. `setx` writes to the User registry but does
+NOT update the current process or already-open shells. After running any
+`setx`-based fix, instruct the user to close and reopen the terminal
+before re-verifying.
+
+## Framework support matrix
+
+The skill's first decision is which framework the user is running. Only
+the "yes" rows trigger system examination; the "no" rows route upstream
+without running any local probes.
+
+| Framework | Examine the system? | Action |
+|---|---|---|
+| **PyTorch** (Linux ROCm wheels) | Yes | `python scripts/examine.py --framework pytorch` followed by `scripts/diagnose.py`. |
+| **PyTorch** (Windows TheRock wheels) | Yes | Same scripts; on Windows `diagnose.py` filters the catalog to the cross-platform + Windows-only entries. |
+| **llama.cpp** (built against system ROCm/HIP SDK) | Yes | `python scripts/examine.py --framework llama-cpp` followed by `scripts/diagnose.py`. |
+| **Lemonade** | No -- ships its own ROCm | Route to <https://github.com/lemonade-sdk/lemonade> + [Discord](https://discord.gg/5xXzkMu8Zk). |
+| **LM Studio** | No -- ships its own runtime | Route to <https://lmstudio.ai/docs/app> + Discord (in-app support, no public repo). |
+| **Ollama** | No -- ships its own runtime | Route to <https://github.com/ollama/ollama> + Discord. |
+| **vLLM** | Out of scope until phase 1+ | Route to <https://github.com/vllm-project/vllm/issues>. |
+| **SGLang** | Out of scope until phase 1+ | Route to <https://github.com/sgl-project/sglang/issues>. |
+
+If a Lemonade / LM Studio / Ollama user reports a problem AND a
+standalone `rocminfo` (Linux) / `hipInfo.exe` (Windows) also fails (i.e.
+the issue is the host install, not the bundled runtime), only then
+escalate to a full examination. That is rare; the default action is
+still to route upstream.
+
+## Device support, phased
+
+The skill ships in three phases. Phase 0 is the only one validated end
+to end; later phases reuse the same scripts but loosen heuristics in
+`diagnose.py`.
+
+| Phase | GPUs | Status |
+|---|---|---|
+| 0 | Ryzen AI APUs (Strix Halo, Strix Point, Krackan, Phoenix, Hawk Point) -- gfx1151 / gfx1150 / gfx1103 / gfx1036 | Validated. Default target. |
+| 1 | Instinct (MI300X, MI300A, MI250, MI210) -- gfx942 / gfx90a | Scripts work; not validated against the full failure list. |
+| 2 | Radeon dGPUs (RDNA3, RDNA4) -- gfx1100, gfx1101, gfx1102, gfx12xx | Scripts work; iGPU/dGPU collision logic specifically targets this case. |
+
+## Live AMD compatibility matrices
+
+Hand-typed kernel/ROCm/distro matrices in skill bodies go stale within
+months. Always fetch live from these pages instead of inlining them:
+
+- **ROCm Linux system requirements** (kernel ranges, distro versions,
+  Python versions): <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html>
+- **ROCm release compatibility matrix** (per-release driver / framework
+  versions): <https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html>
+- **RDNA3.5 system optimization** (APU-specific kernel notes referenced
+  by `apu-memory-tuner`): <https://rocm.docs.amd.com/en/latest/how-to/system-optimization/rdna3-5.html>
+
+`diagnose.py`'s `fix-3-rocm-kernel` recipe always links to the first
+page rather than asserting a fixed kernel floor. The same goes for
+wheel-index URLs in `fix-1-arch` and `fix-8-wheel-rocm`.
+
+## Wheel index reference
+
+For `fix-1-arch` and `fix-8-wheel-rocm`, prefer indexes in this order:
+
+### Linux
+
+1. **Official PyTorch ROCm wheels** -- `https://download.pytorch.org/whl/rocm6.4`
+   (stable) and `https://download.pytorch.org/whl/nightly/rocm6.4` (nightly).
+   Replace `6.4` with the user's system ROCm major.
+2. **TheRock per-gfx wheels** -- <https://github.com/ROCm/TheRock>.
+   The recommended fallback when the official index doesn't yet cover
+   a gfx (typically true for newly released APUs in the first 2-3 ROCm
+   releases after launch).
+3. **Build from source** -- last resort. Pin `PYTORCH_ROCM_ARCH=<gfx>`
+   at build time, not at runtime. See the PyTorch ROCm build guide.
+
+### Windows
+
+1. **TheRock Windows wheels** -- <https://github.com/ROCm/TheRock>. The
+   live source of truth for which gfx targets are supported on Windows
+   right now and which HIP SDK major each wheel pairs with. Always pull
+   the install command from the project README rather than asserting a
+   fixed `--index-url` here.
+2. **Build from source** -- last resort. Requires Visual Studio Build
+   Tools, the HIP SDK on PATH, and `HIP_PATH` set. See the PyTorch ROCm
+   build guide for the Windows-specific environment variables.
+
+For llama.cpp:
+
+```bash
+# Linux:
+cmake -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=<gfx_target>
+cmake --build build -j
+```
+
+```powershell
+# Windows: needs the HIP SDK installed and HIP_PATH set; targets MSVC.
+cmake -B build -G "Visual Studio 17 2022" -DGGML_HIP=ON `
+  -DAMDGPU_TARGETS=<gfx_target>
+cmake --build build --config Release
+```
+
+`AMDGPU_TARGETS` accepts a semicolon-separated list. Build a fat binary for
+multiple GPUs with `-DAMDGPU_TARGETS="gfx1100;gfx1151"` (quoted: a bare `;` ends the shell command).
+
+## Upstream routing
+
+When `diagnose.py` returns no matches (exit 1), route the user to
+exactly one upstream tracker rather than guessing. The mapping
+`UPSTREAM_TRACKERS` in `diagnose.py` is the source of truth; the
+abbreviated version:
+
+| Framework | Tracker |
+|---|---|
+| PyTorch | <https://github.com/pytorch/pytorch/issues> (tag with `rocm`) |
+| llama.cpp | <https://github.com/ggml-org/llama.cpp/issues> |
+| Lemonade | <https://github.com/lemonade-sdk/lemonade/issues> |
+| Ollama | <https://github.com/ollama/ollama/issues> |
+| LM Studio | <https://lmstudio.ai/docs/app> (in-app support) |
+| ROCm core (default) | <https://github.com/ROCm/ROCm/issues> |
+
+Always attach the JSON from `python scripts/examine.py --json` to the
+upstream report. It contains the kernel, GPU(s), ROCm version, install
+method, framework version, and the env-var snapshot that the upstream
+maintainer would otherwise have to ask for.
+
+## Why we do not auto-set `HSA_OVERRIDE_GFX_VERSION`
+
+This deserves its own callout because every other "ROCm not working"
+tutorial on the internet suggests it as the first fix. We deliberately
+suggest it last.
+
+`HSA_OVERRIDE_GFX_VERSION` works by tricking HSA into reporting the
+override gfx string to user space. The compiler then emits ISA for the
+*override* target. The hardware still executes the ISA it natively
+supports. When the two are close (e.g. gfx1100 → gfx1030) most kernels
+run; when they differ in subtle ways (register count, LDS layout, queue
+size) you get OUT_OF_REGISTERS, page faults, or silently wrong results.
+
+Per the SCOPE document's success criteria:
+
+> The skill never proposes `HSA_OVERRIDE_GFX_VERSION` as the *first*
+> fix when a native wheel exists for the user's `gfx` target.
+
+`diagnose.py`'s `fix-1-arch` recipe lists the override only in the notes
+field, marked as a fallback when no native wheel exists. The auto-applied
+path (`fix-2-unset-override`) is the OPPOSITE direction: removing the
+override when the user already has one set unnecessarily.
+
+## Why WSL is out of scope
+
+`examine.py` detects WSL2 (via `microsoft` in `/proc/version` or
+`WSL_DISTRO_NAME` in the environment) and exits 2 with a route-out
+message. It does this on purpose: ROCm-on-WSL has its own failure modes
+that are NOT in this catalog, and pretending they are Linux-native bugs
+just gives users wrong fixes.
+
+What's actually different on WSL:
+
+- The kernel-mode driver lives on the **Windows host**, not in WSL. The
+  user needs a recent Adrenalin Pro / Adrenalin install on the host, plus
+  the WSL kernel update. None of those touch the WSL distro.
+- `/dev/kfd` is replaced by `/dev/dxg` (the DirectX-on-WSL passthrough),
+  so the `fix-4-render-group` and `fix-5-amdgpu-load` checks would fire
+  for reasons that have nothing to do with the problems they detect.
+- The HIP runtime libraries are loaded via `/usr/lib/wsl/lib/` rather
+  than `/opt/rocm/lib`, so an `LD_LIBRARY_PATH` debug session is
+  qualitatively different.
+
+If a WSL user really does need a host-level ROCm fix, the right path is
+the WSL install guide:
+<https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/install/installryz/wsl/howto_wsl.html>. Once
+those WSL-specific prereqs are in place, the user is back to running
+either pure Windows (this skill) or pure native Linux (this skill); WSL
+itself stays out of scope.
+
+## Adjacent problem: matrices in hand-typed tables
+
+Most of what this skill needs (supported GPUs, kernel ranges, ROCm
+releases, wheel arch lists, gfx families) is scattered across hand-typed
+tables in docs pages, READMEs, and release notes. Everyone re-parses the
+same matrix, and they drift.
+
+The real fix is bigger than this skill: ROCm wants a **single,
+agent-friendly source of truth** that feeds both the docs and skills like
+`rocm-doctor`. Until that exists, the scripts here scrape
+`rocm.docs.amd.com` at run time (`fix-3-rocm-kernel` links to the live
+page rather than asserting a version) and the skill body is careful not
+to assert a matrix that will be wrong in 90 days.
+
+When ROCm ships that source of truth, `examine.py` and `diagnose.py`
+should switch to it. Until then, prefer "here is the live URL" over
+"the supported kernels as of this writing are".
diff --git a/skills/rocm-doctor/scripts/apply_fix.py b/skills/rocm-doctor/scripts/apply_fix.py
new file mode 100644
index 0000000..71f8a43
--- /dev/null
+++ b/skills/rocm-doctor/scripts/apply_fix.py
@@ -0,0 +1,978 @@
+#!/usr/bin/env -S uv run --quiet
+# /// script
+# requires-python = ">=3.10"
+# dependencies = []
+# ///
+"""Apply a low-risk fix proposed by `diagnose.py`, or print the plan.
+
+This is the ONLY rocm-doctor script that can change the system. Every
+diagnosis from `diagnose.py` carries a stable `fix_id`; pass it here:
+
+    python scripts/apply_fix.py --fix-id fix-4-render-group
+    python scripts/apply_fix.py --fix-id fix-2-unset-override --dry-run
+    python scripts/apply_fix.py --list
+
+`--dry-run` is the default safety hatch: it prints the planned commands
+and exits 0 without executing anything. Use it to show the user exactly
+what would change.
+
+When a fix has `auto_applicable=False` (most of the structural fixes:
+kernel-module blacklist, repo cleanup, multi-GPU IOMMU, amdgpu-install
+rebuild), this script prints the commands and exits 0 without running
+them, even without `--dry-run`. The user has to copy-paste, because the
+risk of a half-applied state is too high for a tool to take.
+
+Each recipe carries an `applies_on` set of os_family values. `main` refuses
+with exit 3 when the running OS isn't in that set, replacing the per-runner
+platform.system() checks. Linux-only recipes (fix-3, -4, -5, -7, -10, -11,
+-12) refuse on Windows; Windows-only recipes (fix-13, -14, -15) refuse on
+Linux; the rest are cross-platform.
+
+Exit codes:
+  0 = success (or dry-run finished, or fix is advisory-only).
+  2 = unknown --fix-id.
+  3 = required environment is missing (e.g. fix needs `sudo` and there's no
+      sudo, or fix doesn't apply to the running OS).
+  4 = the underlying command exited non-zero; nothing was rolled back.
+  5 = user declined the change at the interactive prompt.
+
+Design constraints:
+  - Never run anything `sudo` without printing the command first.
+  - Never modify the Windows registry, BIOS, or kernel cmdline non-interactively.
+  - Never restart services or reboot the machine.
+  - Never reinstall packages without an explicit --yes flag.
+  - Never silently fall through to an unrelated fix because the requested
+    one wasn't applicable -- exit 3 and tell the user why.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import platform
+import re
+import shutil
+import subprocess
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class FixRecipe:
+    """One catalog entry describing a fix: what it is, where it applies, how to run it.
+
+    `_print_recipe` renders these fields for the user; the `run_*` functions
+    in this file receive the recipe as their second argument.
+    """
+    # Stable identifier; this is the value passed on the command line as --fix-id.
+    fix_id: str
+    # Short human-readable summary, printed next to the id.
+    title: str
+    # Explanation of why this fix is being proposed.
+    rationale: str
+    auto_applicable: bool          # True iff we can run the commands ourselves
+    # Command lines printed to the user under "Commands:" by `_print_recipe`.
+    commands: list[str] = field(default_factory=list)
+    # The three flags below are only reported to the user ("requires sudo",
+    # "requires reboot", "requires re-login") by `_print_recipe`.
+    needs_sudo: bool = False
+    needs_reboot: bool = False
+    needs_relogin: bool = False
+    # Command the user can run afterwards to check the fix; printed when non-empty.
+    verify: str = ""
+    # Free-form caveats, printed one per "Note:" line.
+    notes: list[str] = field(default_factory=list)
+    # OS families this recipe applies on. `main` refuses (exit 3) when the
+    # running OS isn't in this set, replacing the per-runner platform.system()
+    # checks that used to live in each runner.
+    applies_on: frozenset[str] = field(default_factory=lambda: frozenset({"linux"}))
+    # When auto_applicable, this callable runs the actual change. It's
+    # invoked with (args, recipe) and must return an int exit code. We
+    # split this off from `commands` so we can compose multi-step actions
+    # (e.g. usermod followed by checking the resulting group list) without
+    # shelling out to bash.
+    runner: object = None          # Callable[[argparse.Namespace, FixRecipe], int]
+
+
+def _run(cmd: list[str], timeout: float = 60.0) -> tuple[int, str, str]:
+    try:
+        r = subprocess.run(
+            cmd, capture_output=True, text=True, timeout=timeout, check=False,
+        )
+        return r.returncode, r.stdout or "", r.stderr or ""
+    except (FileNotFoundError, subprocess.SubprocessError, OSError) as exc:
+        return 127, "", str(exc)
+
+
+def _have(cmd: str) -> bool:
+    return shutil.which(cmd) is not None
+
+
+def _confirm(prompt: str, assume_yes: bool) -> bool:
+    if assume_yes:
+        return True
+    if not sys.stdin.isatty():
+        # Non-interactive context (CI, agent harness). Refuse to apply
+        # without explicit --yes; printing the plan is enough.
+        print("Non-interactive shell and --yes not passed; refusing to apply.")
+        return False
+    try:
+        ans = input(f"{prompt} [y/N]: ").strip().lower()
+    except EOFError:
+        return False
+    return ans in ("y", "yes")
+
+
+def _print_recipe(r: FixRecipe) -> None:
+    print(f"Fix:        {r.fix_id}  -- {r.title}")
+    print(f"OS scope:   {', '.join(sorted(r.applies_on))}")
+    print(f"Rationale:  {r.rationale}")
+    if r.commands:
+        print("Commands:")
+        for c in r.commands:
+            print(f"  $ {c}")
+    flags = []
+    if r.needs_sudo: flags.append("requires sudo")
+    if r.needs_reboot: flags.append("requires reboot")
+    if r.needs_relogin: flags.append("requires re-login")
+    if not r.auto_applicable: flags.append("manual only (apply_fix.py will NOT run it)")
+    if flags:
+        print(f"Flags:      {', '.join(flags)}")
+    for n in r.notes:
+        print(f"Note:       {n}")
+    if r.verify:
+        print(f"Verify:     {r.verify}")
+
+
+# ---------------------------------------------------------------------------
+# Runners. One per auto-applicable fix.
+#
+# Each runner returns the process exit code. It must:
+#   - Return 3 when a prerequisite is missing (the OS-family gate itself lives in `main`).
+#   - Print every command it runs.
+#   - Respect args.dry_run.
+#   - Respect args.yes (skip the interactive confirm).
+# ---------------------------------------------------------------------------
+
+def run_render_group(args, recipe: FixRecipe) -> int:
+    """fix-4: add the current user to the render group (and 'video' for safety)."""
+    user = os.environ.get("USER") or os.environ.get("LOGNAME") or ""
+    if not user:
+        print("Could not determine current user from $USER/$LOGNAME.")
+        return 3
+    if not _have("usermod"):
+        print("`usermod` not on PATH; cannot add groups.")
+        return 3
+    if not _have("sudo") and os.geteuid() != 0:
+        print("`sudo` is not on PATH and we are not root; cannot add groups.")
+        return 3
+
+    cmd_prefix = [] if os.geteuid() == 0 else ["sudo"]
+    cmd = cmd_prefix + ["usermod", "-a", "-G", "render,video", user]
+    print("Will run:", " ".join(cmd))
+    if args.dry_run:
+        print("(dry-run; not executed)")
+        return 0
+    if not _confirm("Add user to render,video groups?", args.yes):
+        return 5
+    rc, out, err = _run(cmd, timeout=20)
+    if out: sys.stdout.write(out)
+    if err: sys.stderr.write(err)
+    if rc != 0:
+        print(f"usermod exited {rc}; group membership NOT changed.")
+        return 4
+    print(f"Added {user} to render,video.")
+    print(
+        "IMPORTANT: log out and back in (or reboot) for the membership to "
+        "take effect in new shells and services. `newgrp render` patches "
+        "the current shell only."
+    )
+    return 0
+
+
+def run_unset_override(args, recipe: FixRecipe) -> int:
+    """fix-2: unset HSA_OVERRIDE_GFX_VERSION for future shells.
+
+    We can only affect THIS process. Persisting the unset requires editing
+    user dotfiles (Linux) or the per-user environment registry (Windows),
+    which we never do unannounced. We instead:
+      Linux:
+        1. Inspect ~/.bashrc, ~/.zshrc, ~/.profile, ~/.config/fish/config.fish
+           for an `export HSA_OVERRIDE_GFX_VERSION=...` line.
+        2. Print exact $EDITOR instructions for any hit.
+      Windows:
+        1. Read the User and Machine env scopes via PowerShell.
+        2. Tell the user which scope still holds the value and how to clear
+           it (`setx HSA_OVERRIDE_GFX_VERSION ""` or System Properties UI).
+    """
+    if platform.system().lower() == "windows":
+        return _run_unset_override_windows(args, recipe)
+    return _run_unset_override_linux(args, recipe)
+
+
+def _run_unset_override_linux(args, recipe: FixRecipe) -> int:
+    current = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "")
+    if not current:
+        print("HSA_OVERRIDE_GFX_VERSION is already unset in this shell.")
+    else:
+        print(f"HSA_OVERRIDE_GFX_VERSION={current} is set in this shell.")
+        print("In your current shell, run:")
+        print("  unset HSA_OVERRIDE_GFX_VERSION")
+        print("(This script can't unset it in your parent shell; it only sees a copy.)")
+
+    candidates = [
+        Path.home() / ".bashrc",
+        Path.home() / ".bash_profile",
+        Path.home() / ".zshrc",
+        Path.home() / ".profile",
+        Path.home() / ".config" / "fish" / "config.fish",
+    ]
+    rc_hits: list[Path] = []
+    for f in candidates:
+        if not f.exists():
+            continue
+        try:
+            body = f.read_text(encoding="utf-8", errors="replace")
+        except OSError:
+            continue
+        if re.search(r"HSA_OVERRIDE_GFX_VERSION", body):
+            rc_hits.append(f)
+
+    if not rc_hits:
+        print("\nNo persistent HSA_OVERRIDE_GFX_VERSION found in your shell rc files.")
+        return 0
+
+    print("\nPersistent HSA_OVERRIDE_GFX_VERSION found in:")
+    for f in rc_hits:
+        print(f"  - {f}")
+    print(
+        "\nRemove or comment those lines manually. apply_fix.py does NOT edit "
+        "your shell rc files for you; that's your dotfiles. Suggested:"
+    )
+    for f in rc_hits:
+        print(f"  $ $EDITOR {f}   # delete or comment the HSA_OVERRIDE_GFX_VERSION line")
+    return 0
+
+
+def _run_unset_override_windows(args, recipe: FixRecipe) -> int:
+    current = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "")
+    if current:
+        print(f"HSA_OVERRIDE_GFX_VERSION={current} is set in this shell.")
+        print("Note: clearing it in your Windows env scope does NOT affect this")
+        print("already-open shell -- close and reopen your terminal afterwards.")
+    else:
+        print("HSA_OVERRIDE_GFX_VERSION is not set in this shell.")
+
+    user_val = ""
+    machine_val = ""
+    rc, out, _ = _run([
+        "powershell", "-NoProfile", "-Command",
+        "[Environment]::GetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION','User')",
+    ], timeout=8)
+    if rc == 0:
+        user_val = out.strip()
+    rc, out, _ = _run([
+        "powershell", "-NoProfile", "-Command",
+        "[Environment]::GetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION','Machine')",
+    ], timeout=8)
+    if rc == 0:
+        machine_val = out.strip()
+
+    if not user_val and not machine_val:
+        print("\nNo persistent HSA_OVERRIDE_GFX_VERSION found in either the User")
+        print("or Machine env scope. You're done after closing/reopening shells.")
+        return 0
+
+    print("\nPersistent HSA_OVERRIDE_GFX_VERSION found in:")
+    if user_val:
+        print(f"  User scope:    {user_val}")
+    if machine_val:
+        print(f"  Machine scope: {machine_val}")
+
+    if user_val:
+        print('\nClear from the User scope (no admin needed):')
+        print('  Will run: setx HSA_OVERRIDE_GFX_VERSION ""')
+        if args.dry_run:
+            print("  (dry-run; not executed)")
+        elif _confirm("Clear HSA_OVERRIDE_GFX_VERSION from User scope?", args.yes):
+            rc, out, err = _run(["setx", "HSA_OVERRIDE_GFX_VERSION", ""], timeout=15)
+            if out: sys.stdout.write(out)
+            if err: sys.stderr.write(err)
+            if rc != 0:
+                print(f"setx exited {rc}; User scope NOT changed.")
+                return 4
+            print("Cleared from User scope. Reopen your terminal for it to take effect.")
+
+    if machine_val:
+        print(
+            "\nThe Machine scope value cannot be cleared without an Admin shell. "
+            "Either run an elevated PowerShell and execute:"
+        )
+        print("  [Environment]::SetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION', $null, 'Machine')")
+        print(
+            "or remove it through System Properties -> Environment Variables -> "
+            "System variables. apply_fix.py does NOT elevate itself."
+        )
+    return 0
+
+
+def run_path_export(args, recipe: FixRecipe) -> int:
+    """fix-6: persist the ROCm/HIP bin directory on PATH (with consent)."""
+    if platform.system().lower() == "windows":
+        return _run_path_export_windows(args, recipe)
+    return _run_path_export_linux(args, recipe)
+
+
+def _run_path_export_linux(args, recipe: FixRecipe) -> int:
+    """Append `/opt/rocm/bin` to ~/.bashrc (or ~/.zshrc).
+
+    Simplest possible thing: append a single line. We never reorder PATH,
+    we never edit /etc/environment. If the line is already there we exit
+    0 without re-appending.
+    """
+    bin_dir = "/opt/rocm/bin"
+    if not Path(bin_dir).is_dir():
+        print(f"{bin_dir} does not exist; nothing to add to PATH.")
+        return 3
+
+    shell = os.environ.get("SHELL", "")
+    rc_file = Path.home() / (".zshrc" if "zsh" in shell else ".bashrc")
+    if not rc_file.exists() and (Path.home() / ".bashrc").exists():
+        rc_file = Path.home() / ".bashrc"
+
+    export_line = f'export PATH="{bin_dir}:$PATH"'
+    existing = ""
+    if rc_file.exists():
+        try:
+            existing = rc_file.read_text(encoding="utf-8", errors="replace")
+        except OSError as exc:
+            print(f"Could not read {rc_file}: {exc}")
+            return 3
+        if re.search(rf"PATH=.*{re.escape(bin_dir)}", existing):
+            print(f"{rc_file} already adds {bin_dir} to PATH; no change.")
+            return 0
+
+    print(f"Plan: append the following line to {rc_file}:")
+    print(f"  {export_line}")
+    if args.dry_run:
+        print("(dry-run; not executed)")
+        return 0
+    if not _confirm(f"Append to {rc_file}?", args.yes):
+        return 5
+
+    try:
+        with rc_file.open("a", encoding="utf-8") as fh:
+            fh.write(f"\n# Added by rocm-doctor (apply_fix.py fix-6-path)\n")
+            fh.write(export_line + "\n")
+    except OSError as exc:
+        print(f"Failed to write {rc_file}: {exc}")
+        return 4
+
+    print(
+        f"Appended to {rc_file}. Open a new shell or run `source {rc_file}` "
+        "for the change to take effect."
+    )
+    return 0
+
+
+def _run_path_export_windows(args, recipe: FixRecipe) -> int:
+    """Append the HIP SDK's bin directory to the User PATH via setx.
+
+    `setx` is the only documented way to persist a User env var on
+    Windows without elevation. It rewrites the whole variable; here we
+    fetch the current User-scope PATH first, append our directory if it
+    isn't there yet, and write the result back.
+    """
+    sdk_path = os.environ.get("HIP_PATH", "")
+    if not sdk_path:
+        for root in (r"C:\Program Files\AMD\ROCm", r"C:\Program Files (x86)\AMD\ROCm"):
+            try:
+                base = Path(root)
+                if base.is_dir():
+                    for child in sorted(base.iterdir(), reverse=True):
+                        if child.is_dir() and re.match(r"\d+(\.\d+)+", child.name):
+                            sdk_path = str(child)
+                            break
+            except OSError:
+                continue
+            if sdk_path:
+                break
+    if not sdk_path:
+        print("No HIP SDK install found. Run fix-13-hip-sdk-missing first.")
+        return 3
+    bin_dir = str(Path(sdk_path) / "bin")
+    if not Path(bin_dir).is_dir():
+        print(f"{bin_dir} does not exist on disk; HIP SDK install looks incomplete.")
+        return 3
+
+    rc, out, _ = _run([
+        "powershell", "-NoProfile", "-Command",
+        "[Environment]::GetEnvironmentVariable('PATH','User')",
+    ], timeout=8)
+    user_path = out.strip() if rc == 0 else ""
+    if user_path and bin_dir.lower() in user_path.lower():
+        print(f"User PATH already contains {bin_dir}; no change.")
+        return 0
+    new_path = (user_path + ";" + bin_dir).lstrip(";") if user_path else bin_dir
+
+    print(f"Plan: prepend {bin_dir} to your User PATH:")
+    print(f"  setx PATH \"{new_path}\"")
+    if args.dry_run:
+        print("(dry-run; not executed)")
+        return 0
+    if not _confirm("Update User PATH?", args.yes):
+        return 5
+
+    rc, out, err = _run(["setx", "PATH", new_path], timeout=15)
+    if out: sys.stdout.write(out)
+    if err: sys.stderr.write(err)
+    if rc != 0:
+        print(f"setx exited {rc}; User PATH NOT changed.")
+        return 4
+    print(
+        f"Added {bin_dir} to your User PATH. setx only takes effect in NEW "
+        "shells -- close this terminal and reopen it before re-running hipInfo."
+    )
+    return 0
+
+
+def run_hip_visible_devices(args, recipe: FixRecipe) -> int:
+    """fix-9: persist HIP_VISIBLE_DEVICES so the iGPU is hidden.
+
+    We DO NOT pick a device index automatically -- rocminfo / hipInfo
+    ordering can surprise even experienced users on dual-GPU laptops.
+    Instead, we print a guided query and accept --device-index as the
+    explicit input.
+    """
+    if platform.system().lower() == "windows":
+        return _run_hip_visible_devices_windows(args, recipe)
+    return _run_hip_visible_devices_linux(args, recipe)
+
+
+def _run_hip_visible_devices_linux(args, recipe: FixRecipe) -> int:
+    idx = args.device_index
+    if idx is None:
+        print(
+            "Run `rocminfo | grep -E 'Agent |Marketing|gfx'` and identify the "
+            "row of your DISCRETE GPU (the iGPU is typically Agent 1). Then "
+            "re-run apply_fix.py with --device-index N."
+        )
+        return 3
+
+    shell = os.environ.get("SHELL", "")
+    rc_file = Path.home() / (".zshrc" if "zsh" in shell else ".bashrc")
+    if not rc_file.exists() and (Path.home() / ".bashrc").exists():
+        rc_file = Path.home() / ".bashrc"
+
+    export_line = f'export HIP_VISIBLE_DEVICES={idx}'
+    if rc_file.exists():
+        try:
+            existing = rc_file.read_text(encoding="utf-8", errors="replace")
+        except OSError as exc:
+            print(f"Could not read {rc_file}: {exc}")
+            return 3
+        if re.search(r"HIP_VISIBLE_DEVICES=", existing):
+            print(
+                f"{rc_file} already sets HIP_VISIBLE_DEVICES; edit by hand "
+                "rather than appending a second copy."
+            )
+            return 0
+
+    print(f"Plan: append the following line to {rc_file}:")
+    print(f"  {export_line}")
+    if args.dry_run:
+        print("(dry-run; not executed)")
+        return 0
+    if not _confirm(f"Append to {rc_file}?", args.yes):
+        return 5
+    try:
+        with rc_file.open("a", encoding="utf-8") as fh:
+            fh.write("\n# Added by rocm-doctor (apply_fix.py fix-9-igpu-dgpu)\n")
+            fh.write(export_line + "\n")
+    except OSError as exc:
+        print(f"Failed to write {rc_file}: {exc}")
+        return 4
+    print(
+        f"Appended to {rc_file}. Open a new shell for the change to take effect, "
+        "then re-run your workload."
+    )
+    return 0
+
+
+def _run_hip_visible_devices_windows(args, recipe: FixRecipe) -> int:
+    idx = args.device_index
+    if idx is None:
+        print(
+            "Run the following to identify the discrete GPU's index:"
+        )
+        print(
+            '  & "$env:HIP_PATH\\bin\\hipInfo.exe" | '
+            'Select-String "device#|Name|gcnArchName"'
+        )
+        print(
+            "Then re-run apply_fix.py with --device-index N (the iGPU is "
+            "typically device# 0; the dGPU is usually device# 1)."
+        )
+        return 3
+
+    rc, out, _ = _run([
+        "powershell", "-NoProfile", "-Command",
+        "[Environment]::GetEnvironmentVariable('HIP_VISIBLE_DEVICES','User')",
+    ], timeout=8)
+    existing = out.strip() if rc == 0 else ""
+    if existing:
+        print(
+            f"User scope already sets HIP_VISIBLE_DEVICES={existing!r}; "
+            "remove or update it manually rather than overwriting from this script."
+        )
+        return 0
+
+    print("Plan: persist HIP_VISIBLE_DEVICES in the User env scope:")
+    print(f"  setx HIP_VISIBLE_DEVICES {idx}")
+    if args.dry_run:
+        print("(dry-run; not executed)")
+        return 0
+    if not _confirm("Set HIP_VISIBLE_DEVICES in the User scope?", args.yes):
+        return 5
+
+    rc, out, err = _run(["setx", "HIP_VISIBLE_DEVICES", str(idx)], timeout=15)
+    if out: sys.stdout.write(out)
+    if err: sys.stderr.write(err)
+    if rc != 0:
+        print(f"setx exited {rc}; HIP_VISIBLE_DEVICES NOT changed.")
+        return 4
+    print(
+        "setx only takes effect in NEW shells -- close this terminal and "
+        "reopen it before re-running your workload."
+    )
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Recipe registry. Mirrors the diagnosis catalog in `diagnose.py`. Only the
+# small, safe, well-bounded fixes are auto-applicable; everything else is
+# advisory and prints the plan only.
+# ---------------------------------------------------------------------------
+
+LINUX_AND_WINDOWS = frozenset({"linux", "windows"})
+LINUX_ONLY = frozenset({"linux"})
+WINDOWS_ONLY = frozenset({"windows"})
+
+
+RECIPES: dict[str, FixRecipe] = {
+    "fix-1-arch": FixRecipe(
+        fix_id="fix-1-arch",
+        title="GPU gfx target not in framework arch list",
+        rationale=(
+            "Your GPU's gfx target is not in the framework wheel's compiled "
+            "kernel list. Re-install the framework from an index that includes "
+            "this gfx, OR rebuild llama.cpp with AMDGPU_TARGETS=<gfx>."
+        ),
+        auto_applicable=False,
+        applies_on=LINUX_AND_WINDOWS,
+        commands=[
+            "# PyTorch (Linux): switch to the ROCm nightly that ships the gfx115x kernels.",
+            "pip uninstall -y torch torchvision torchaudio",
+            "pip install --pre torch torchvision torchaudio \\",
+            "  --index-url https://download.pytorch.org/whl/nightly/rocm6.4",
+            "# PyTorch (Windows): use TheRock's per-gfx wheels (https://github.com/ROCm/TheRock).",
+            "# llama.cpp:",
+            "# cmake -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=<your_gfx_target>",
+            "# cmake --build build -j",
+        ],
+        notes=[
+            "TheRock per-gfx wheels are the recommended fallback when the "
+            "official pytorch index does not yet cover your gfx (and the only "
+            "first-party option on Windows AMD).",
+            "HSA_OVERRIDE_GFX_VERSION is NOT the right fix here -- it papers "
+            "over the mismatch and risks page faults at runtime.",
+        ],
+        verify="python -c \"import torch; print(torch.cuda.is_available(), torch.cuda.get_arch_list())\"",
+    ),
+    "fix-2-unset-override": FixRecipe(
+        fix_id="fix-2-unset-override",
+        title="Unset HSA_OVERRIDE_GFX_VERSION",
+        rationale=(
+            "HSA_OVERRIDE_GFX_VERSION is set, but your GPU now has a native "
+            "wheel. The override hides the real gfx and causes page faults / "
+            "OUT_OF_REGISTERS at runtime."
+        ),
+        auto_applicable=True,
+        applies_on=LINUX_AND_WINDOWS,
+        commands=[
+            "# Linux:",
+            "unset HSA_OVERRIDE_GFX_VERSION",
+            "# Then remove the line from ~/.bashrc / ~/.zshrc / ~/.profile.",
+            "# Windows:",
+            'setx HSA_OVERRIDE_GFX_VERSION ""',
+            "# Or remove via System Properties -> Environment Variables.",
+        ],
+        runner=run_unset_override,
+        verify="env | grep HSA_OVERRIDE_GFX_VERSION || echo OK_UNSET",
+    ),
+    "fix-3-rocm-kernel": FixRecipe(
+        fix_id="fix-3-rocm-kernel",
+        title="ROCm/distro/kernel triple unsupported",
+        rationale=(
+            "ROCm is installed but your kernel/distro combination is outside "
+            "the supported matrix. Match the kernel to the matrix before "
+            "reinstalling, or rerun with --no-dkms and accept the risk."
+        ),
+        auto_applicable=False,
+        applies_on=LINUX_ONLY,
+        commands=[
+            "# Cross-check the live AMD matrix before changing anything:",
+            "#   https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html",
+            "# Common fix on Ubuntu: install the HWE kernel that matches your ROCm release, then reboot.",
+        ],
+        needs_reboot=True,
+        verify="lsmod | grep amdgpu && rocminfo | head -n 5",
+    ),
+    "fix-4-render-group": FixRecipe(
+        fix_id="fix-4-render-group",
+        title="Add user to render/video groups",
+        rationale=(
+            "The current user can't open /dev/kfd because they aren't in the "
+            "render group. Adding the user is the safe, standard fix."
+        ),
+        auto_applicable=True,
+        applies_on=LINUX_ONLY,
+        commands=['sudo usermod -a -G render,video "$USER"'],
+        needs_sudo=True,
+        needs_relogin=True,
+        runner=run_render_group,
+        verify="groups | tr ' ' '\\n' | grep -E '^(render|video)$' && rocminfo | head -n 5",
+    ),
+    "fix-5-amdgpu-load": FixRecipe(
+        fix_id="fix-5-amdgpu-load",
+        title="Load amdgpu (and clear any blacklist)",
+        rationale=(
+            "The amdgpu kernel module is not loaded. Check /etc/modprobe.d "
+            "for a blacklist entry, regenerate the initramfs, and modprobe."
+        ),
+        auto_applicable=False,
+        applies_on=LINUX_ONLY,
+        commands=[
+            "grep -RIl 'blacklist amdgpu' /etc/modprobe.d /usr/lib/modprobe.d 2>/dev/null || true",
+            "sudo $EDITOR <file shown above>     # remove the blacklist line",
+            "sudo update-initramfs -u            # Debian/Ubuntu",
+            "sudo dracut -f                      # Fedora/RHEL",
+            "sudo modprobe amdgpu",
+        ],
+        needs_sudo=True,
+        needs_reboot=True,
+        verify="lsmod | grep amdgpu && rocminfo | head -n 5",
+        notes=[
+            "If Secure Boot is enabled and amdgpu still won't load, the DKMS "
+            "module isn't signed. Either sign it with mokutil or disable "
+            "Secure Boot in firmware.",
+        ],
+    ),
+    "fix-6-path": FixRecipe(
+        fix_id="fix-6-path",
+        title="Add the ROCm/HIP bin directory to PATH",
+        rationale=(
+            "Linux: ROCm is installed at /opt/rocm but its bin directory isn't "
+            "on PATH, so `rocminfo` / `hipcc` aren't visible to the shell. "
+            "Windows: the HIP SDK is installed but its bin directory isn't on "
+            "the User PATH, so `hipInfo.exe` and the runtime DLLs can't be found."
+        ),
+        auto_applicable=True,
+        applies_on=LINUX_AND_WINDOWS,
+        commands=[
+            "# Linux:",
+            'echo \'export PATH="/opt/rocm/bin:$PATH"\' >> ~/.bashrc',
+            "# Windows:",
+            'setx PATH "%PATH%;C:\\Program Files\\AMD\\ROCm\\<version>\\bin"',
+        ],
+        runner=run_path_export,
+        verify="rocminfo | head -n 5 && hipcc --version",
+    ),
+    "fix-7-stale-repos": FixRecipe(
+        fix_id="fix-7-stale-repos",
+        title="Quarantine duplicate AMD repos",
+        rationale=(
+            "More than one ROCm/AMDGPU repo file exists. The package manager "
+            "is mixing versions; quarantine the extras before reinstalling."
+        ),
+        auto_applicable=False,
+        applies_on=LINUX_ONLY,
+        commands=[
+            "ls /etc/apt/sources.list.d/ | grep -iE 'rocm|amdgpu|radeon'",
+            "# For each duplicate file:",
+            "sudo mv /etc/apt/sources.list.d/<file>.list /etc/apt/sources.list.d/<file>.list.bak",
+            "sudo apt update",
+        ],
+        needs_sudo=True,
+        verify="sudo apt update 2>&1 | tail -n 20",
+    ),
+    "fix-8-wheel-rocm": FixRecipe(
+        fix_id="fix-8-wheel-rocm",
+        title="Reinstall the framework against the system ROCm/HIP major",
+        rationale=(
+            "The framework's bundled HIP version doesn't match the system "
+            "ROCm (Linux) or HIP SDK (Windows). libamdhip64.so.X / "
+            "amdhip64_X.dll load failures are the usual signal."
+        ),
+        auto_applicable=False,
+        applies_on=LINUX_AND_WINDOWS,
+        commands=[
+            "pip uninstall -y torch torchvision torchaudio",
+            "# Linux: pick the index that matches your system ROCm major:",
+            "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4",
+            "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3",
+            "# Windows: use TheRock's wheels matching your HIP SDK major:",
+            "#   https://github.com/ROCm/TheRock",
+        ],
+        verify="python -c \"import torch; print(torch.__version__, torch.version.hip, torch.cuda.is_available())\"",
+    ),
+    "fix-9-igpu-dgpu": FixRecipe(
+        fix_id="fix-9-igpu-dgpu",
+        title="Hide the iGPU with HIP_VISIBLE_DEVICES",
+        rationale=(
+            "Both an APU iGPU and a discrete AMD GPU are visible. Pin the "
+            "runtime to the dGPU so the iGPU doesn't destabilise it."
+        ),
+        auto_applicable=True,
+        applies_on=LINUX_AND_WINDOWS,
+        commands=[
+            "# Linux:",
+            "rocminfo | grep -E 'Agent |Marketing|gfx'   # find the dGPU index",
+            "export HIP_VISIBLE_DEVICES=<dGPU-index>",
+            "# Windows:",
+            '& "$env:HIP_PATH\\bin\\hipInfo.exe" | Select-String "device#|Name"',
+            "setx HIP_VISIBLE_DEVICES <dGPU-index>",
+        ],
+        runner=run_hip_visible_devices,
+        verify="python -c \"import torch; print(torch.cuda.device_count(), torch.cuda.get_device_name(0))\"",
+        notes=[
+            "Pass --device-index N to persist the env var; without it, "
+            "this fix only prints the rocminfo / hipInfo query so you can identify N.",
+        ],
+    ),
+    "fix-10-container": FixRecipe(
+        fix_id="fix-10-container",
+        title="Re-launch the container with AMD devices passed through",
+        rationale=(
+            "The container can't see /dev/kfd or /dev/dri/renderD*. Pass the "
+            "devices and the host's render group via the runtime flags."
+        ),
+        auto_applicable=False,
+        applies_on=LINUX_ONLY,
+        commands=[
+            "docker run --rm -it \\",
+            "  --device=/dev/kfd \\",
+            "  --device=/dev/dri \\",
+            "  --group-add render \\",
+            "  --security-opt seccomp=unconfined \\",
+            "  --shm-size=8g \\",
+            "  rocm/pytorch:latest",
+        ],
+        verify="rocminfo | head -n 5",
+        notes=[
+            "Rootless podman additionally needs `--userns=keep-id` and a "
+            "host user that is in the render group; podman maps it through.",
+        ],
+    ),
+    "fix-11-iommu": FixRecipe(
+        fix_id="fix-11-iommu",
+        title="Add iommu=pt to the kernel command line",
+        rationale=(
+            "Multi-GPU jobs hang when the IOMMU is in the default 'on' mode "
+            "with translation; pass-through mode fixes the hang. This requires "
+            "editing GRUB and rebooting; we will not do that for you."
+        ),
+        auto_applicable=False,
+        applies_on=LINUX_ONLY,
+        commands=[
+            "cat /proc/cmdline",
+            "sudo $EDITOR /etc/default/grub        # add iommu=pt to GRUB_CMDLINE_LINUX_DEFAULT",
+            "sudo update-grub                       # Debian/Ubuntu",
+            "sudo grub2-mkconfig -o /boot/grub2/grub.cfg   # Fedora/RHEL",
+            "# Reboot, then retry the multi-GPU workload.",
+        ],
+        needs_sudo=True,
+        needs_reboot=True,
+        verify="cat /proc/cmdline | grep -o 'iommu=\\w*'",
+    ),
+    "fix-12-installer": FixRecipe(
+        fix_id="fix-12-installer",
+        title="Reset amdgpu-install state and reinstall",
+        rationale=(
+            "amdgpu-install left a half-configured DKMS / repo state. Run "
+            "the documented uninstall, clean up, and reinstall without the "
+            "flag that broke things (commonly --accept-eula on newer installers)."
+        ),
+        auto_applicable=False,
+        applies_on=LINUX_ONLY,
+        commands=[
+            "sudo amdgpu-install --uninstall",
+            "sudo apt autoremove --purge -y",
+            "sudo apt update",
+            "sudo amdgpu-install --usecase=rocm,hip",
+        ],
+        needs_sudo=True,
+        needs_reboot=True,
+        verify="dpkg -l | grep -E 'rocm|amdgpu' | head -n 20 && rocminfo | head -n 5",
+        notes=[
+            "If `apt autoremove --purge` warns it will remove unrelated "
+            "packages, stop and resolve those by hand before continuing.",
+        ],
+    ),
+    "fix-13-hip-sdk-missing": FixRecipe(
+        fix_id="fix-13-hip-sdk-missing",
+        title="Install the AMD HIP SDK for Windows",
+        rationale=(
+            "Your framework links against HIP but the HIP SDK isn't installed "
+            "on this host. The runtime DLLs (amdhip64_X.dll, hipblas.dll, "
+            "hsa-runtime64.dll) and hipInfo.exe ship inside the SDK installer."
+        ),
+        auto_applicable=False,
+        applies_on=WINDOWS_ONLY,
+        commands=[
+            "# Download and install the HIP SDK (matched to your framework's HIP major):",
+            "#   https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html",
+            "# After install, reopen the shell so HIP_PATH and PATH pick up the new install.",
+        ],
+        verify=(
+            'powershell -NoProfile -Command '
+            '"& \\"$env:HIP_PATH\\bin\\hipInfo.exe\\" | Select-Object -First 5"'
+        ),
+        notes=[
+            "If you only need PyTorch on Windows AMD and don't need the C/C++ "
+            "HIP toolchain, the TheRock wheels bundle their own HIP runtime "
+            "and may not require a system HIP SDK install.",
+        ],
+    ),
+    "fix-14-adrenalin-too-old": FixRecipe(
+        fix_id="fix-14-adrenalin-too-old",
+        title="Update the Adrenalin / kernel-mode driver",
+        rationale=(
+            "The HIP SDK is installed but the AMD kernel-mode driver "
+            "(Adrenalin / Adrenalin Pro) is older than the SDK release notes "
+            "call out. The user-space SDK and the driver have to match."
+        ),
+        auto_applicable=False,
+        applies_on=WINDOWS_ONLY,
+        commands=[
+            "# Cross-check the HIP SDK release notes for the exact driver pairing:",
+            "#   https://rocm.docs.amd.com/projects/install-on-windows/en/latest/install/install.html",
+            "# Then download the matching driver from:",
+            "#   https://www.amd.com/en/support",
+            "# Reboot after the install for the kernel-mode driver to take effect.",
+        ],
+        needs_reboot=True,
+        verify=(
+            'powershell -NoProfile -Command '
+            '"(Get-CimInstance Win32_VideoController | '
+            "Where-Object { $_.Name -like '*AMD*' -or $_.Name -like '*Radeon*' } | "
+            'Select-Object -First 1).DriverVersion"'
+        ),
+    ),
+    "fix-15-msvc-redist": FixRecipe(
+        fix_id="fix-15-msvc-redist",
+        title="Install the MSVC 2015-2022 runtime redistributable",
+        rationale=(
+            "The HIP SDK's amdhip64_X.dll links against the MSVC 2015-2022 "
+            "runtime. When vcruntime140.dll / vcruntime140_1.dll aren't on "
+            "PATH, `import torch` fails with a missing-DLL error that points "
+            "at vcruntime140_1.dll, not at the HIP runtime itself."
+        ),
+        auto_applicable=False,
+        applies_on=WINDOWS_ONLY,
+        commands=[
+            "# Download and install (x64):",
+            "#   https://aka.ms/vs/17/release/vc_redist.x64.exe",
+            "# After the install, reopen the shell and re-run your import / hipInfo check.",
+        ],
+        verify="where vcruntime140.dll && where vcruntime140_1.dll",
+        notes=[
+            "If installing the redistributable still leaves a missing-DLL "
+            "error, the failing DLL is probably amdhip64_X.dll itself; that "
+            "points at fix-13-hip-sdk-missing rather than this fix.",
+        ],
+    ),
+}
+
+
+def _list_recipes() -> None:
+    """Print one line per entry in RECIPES: apply mode, OS scope, id, title."""
+    print("Available fix-ids (mirror diagnose.py):")
+    for recipe in RECIPES.values():
+        if recipe.auto_applicable:
+            mode = "AUTO"
+        else:
+            mode = "PRINT-ONLY"
+        # Sorted so the OS scope column is stable regardless of set ordering.
+        platforms = "/".join(sorted(recipe.applies_on))
+        print(f"  [{mode:>10s}] [{platforms:>14s}] {recipe.fix_id}  -- {recipe.title}")
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse the CLI flags, look up the requested recipe, then print or run it.
+
+    Return values produced directly by this function:
+      0 -- after --list, after --json, or after printing a print-only recipe.
+      2 -- the --fix-id is not a key of RECIPES.
+      3 -- the recipe's `applies_on` does not include the running OS.
+      4 -- the recipe is auto-applicable but has no runner attached.
+    For an auto-applicable recipe the runner's own return value is returned.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--fix-id",
+        help="Stable fix identifier from diagnose.py (e.g. fix-4-render-group).",
+    )
+    parser.add_argument("--list", action="store_true", help="List every fix-id and exit.")
+    parser.add_argument(
+        "--dry-run", action="store_true", help="Show the plan without changing anything."
+    )
+    parser.add_argument(
+        "--yes",
+        action="store_true",
+        help="Skip the interactive confirmation. Use only when the user has "
+             "already approved the plan in chat.",
+    )
+    parser.add_argument(
+        "--device-index",
+        type=int,
+        default=None,
+        help="For fix-9-igpu-dgpu: the rocminfo Agent index of the discrete GPU.",
+    )
+    parser.add_argument(
+        "--json", action="store_true", help="Emit the recipe as JSON instead of running it."
+    )
+    opts = parser.parse_args(argv)
+
+    if opts.list:
+        _list_recipes()
+        return 0
+    if not opts.fix_id:
+        # parser.error() prints usage and exits; it does not return.
+        parser.error("--fix-id or --list is required")
+
+    recipe = RECIPES.get(opts.fix_id)
+    if recipe is None:
+        print(f"Unknown fix-id: {opts.fix_id}", file=sys.stderr)
+        print("Run `python scripts/apply_fix.py --list` for the full list.", file=sys.stderr)
+        return 2
+
+    if opts.json:
+        # The runner callable is not JSON-serialisable, so it is left out.
+        # `applies_on` is emitted as a sorted list to keep the output stable.
+        payload = {
+            key: (sorted(value) if key == "applies_on" else value)
+            for key, value in recipe.__dict__.items()
+            if key != "runner"
+        }
+        print(json.dumps(payload, indent=2))
+        return 0
+
+    _print_recipe(recipe)
+    print()
+
+    current_os = platform.system().lower()
+    if current_os not in recipe.applies_on:
+        supported = ", ".join(sorted(recipe.applies_on))
+        print(f"This fix only applies on: {supported}. Running OS is: {current_os}.")
+        return 3
+
+    if not recipe.auto_applicable:
+        print("This fix is print-only (manual change required).")
+        print("Copy the commands above, run them yourself, then verify with:")
+        if recipe.verify:
+            print(f"  $ {recipe.verify}")
+        return 0
+
+    runner = recipe.runner
+    if runner is None:
+        # Defensive: an auto_applicable recipe with no runner is a bug.
+        print("Internal error: auto-applicable recipe has no runner.", file=sys.stderr)
+        return 4
+    return runner(opts, recipe)  # type: ignore[misc]
+
+
+# Script entry point: main()'s integer return value becomes the process exit code.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/rocm-doctor/scripts/diagnose.py b/skills/rocm-doctor/scripts/diagnose.py
new file mode 100644
index 0000000..54f4982
--- /dev/null
+++ b/skills/rocm-doctor/scripts/diagnose.py
@@ -0,0 +1,1333 @@
+#!/usr/bin/env -S uv run --quiet
+# /// script
+# requires-python = ">=3.10"
+# dependencies = []
+# ///
+"""Match an `examine.py` snapshot against the rocm-doctor failure-mode list.
+
+This script is the opinionated decision tree the `rocm-doctor` skill is
+built around. It takes:
+
+  1. The JSON output of `examine.py` (machine state).
+  2. Optionally the user's error text (symptom).
+
+and returns a ranked list of matches against the catalog of known
+misconfigurations in `reference.md`. Each match comes with:
+
+  - id       : stable identifier reused by `apply_fix.py` (e.g. "fix-4-render-group").
+  - title    : one-line description of the failure mode.
+  - score    : 0..100 confidence the user is hitting this case.
+  - evidence : the concrete facts the score is based on.
+  - fix      : the next action and a `verify` command the agent can re-run.
+
+Usage:
+    python scripts/examine.py --json > exam.json
+    python scripts/diagnose.py --exam exam.json
+    python scripts/diagnose.py --exam exam.json --symptom "HIP error: invalid device function"
+    python scripts/diagnose.py --exam exam.json --json
+    python scripts/diagnose.py --exam exam.json --top 3
+
+Exit codes:
+  0 = at least one diagnosis matched (score >= MIN_SCORE_FOR_MATCH).
+  1 = nothing matched; this is the explicit "I don't recognise this failure
+      mode" path. The agent should NOT speculate; it should hand the user
+      the upstream tracker URL printed by --json.
+  2 = exam JSON is missing or malformed.
+
+The closed list is deliberate. New failure modes go through a code change
+here; they do not get invented by the agent at runtime.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Callable
+
+# A score above this threshold is treated as "we think this is it".
+# Tuned so that a single direct symptom keyword match (worth ~40) plus a
+# corroborating state signal (worth ~20+) is enough to surface a diagnosis.
+# NOTE: the module docstring words the exit-code-0 condition as
+# `score >= MIN_SCORE_FOR_MATCH`, i.e. a score of exactly 50 counts.
+MIN_SCORE_FOR_MATCH = 50
+
+# Above this score we tell the agent to propose the fix immediately; below
+# it (but above MIN_SCORE_FOR_MATCH) we surface as "likely" and ask the
+# user to confirm one more piece of evidence first.
+HIGH_CONFIDENCE = 75
+
+# Upstream router used when nothing matches. Keeping the URL list short so
+# the agent has exactly one place to send each kind of report.
+# Keys are component names; values are display strings rather than bare
+# URLs -- several carry a parenthetical hint after the link.
+UPSTREAM_TRACKERS = {
+    "rocm-core":   "https://github.com/ROCm/ROCm/issues",
+    "pytorch":     "https://github.com/pytorch/pytorch/issues  (tag with rocm label)",
+    "llama-cpp":   "https://github.com/ggml-org/llama.cpp/issues",
+    "lemonade":    "https://github.com/lemonade-sdk/lemonade/issues",
+    "ollama":      "https://github.com/ollama/ollama/issues",
+    "lm-studio":   "https://lmstudio.ai/docs/app  (use in-app support; no public repo)",
+    "amdgpu-install": "https://repo.radeon.com  (raise via your AMD support contact)",
+}
+
+
+@dataclass
+class Fix:
+    """Proposed next action attached to a `Diagnosis`.
+
+    Built by the `check_*` functions; `fix_id` is the handle that
+    `apply_fix.py --fix-id` accepts.
+    """
+
+    summary: str                          # one-line plan
+    # Lines shown to the user; the check_* functions mix runnable commands
+    # with `#`-prefixed explanatory lines.
+    commands: list[str] = field(default_factory=list)
+    needs_sudo: bool = False
+    needs_reboot: bool = False
+    needs_relogin: bool = False
+    fix_id: str = ""                      # passed to apply_fix.py --fix-id
+    auto_applicable: bool = False         # True iff apply_fix.py can run it
+    notes: list[str] = field(default_factory=list)
+    verify: str = ""                      # command the agent should run after
+
+
+@dataclass
+class Diagnosis:
+    """One candidate match from a `check_*` function.
+
+    The checkers return `score=0` (with no evidence and no fix) to mean
+    "not a match".
+    """
+
+    id: str                               # stable identifier, e.g. "fix-4-render-group"
+    title: str                            # one-line description of the failure mode
+    score: int                            # confidence; checkers clamp it to at most 100
+    evidence: list[str] = field(default_factory=list)  # facts the score is based on
+    fix: Fix | None = None                # None on the score=0 "not a match" results
+
+
+# ---------------------------------------------------------------------------
+# Symptom keyword tables. Each tuple is (regex, weight, label-for-evidence).
+# Weights are tuned so that one specific error message (libamdhip64.so.X,
+# HSA_STATUS_ERROR_INVALID_ISA) is enough to dominate the diagnosis on its
+# own, while vague matches (the word "hang") only nudge the score.
+# ---------------------------------------------------------------------------
+
+# Symptoms scored by check_1_arch_not_in_wheel. Patterns are written in
+# lowercase because _keyword_score lowercases the symptom before matching.
+KEYWORDS_INVALID_ISA = [
+    (r"hiperrornobinaryforgpu", 45, "error mentions hipErrorNoBinaryForGpu"),
+    (r"hsa_status_error_invalid_isa", 50, "error mentions HSA_STATUS_ERROR_INVALID_ISA"),
+    (r"invalid device function", 40, "error mentions 'invalid device function'"),
+    (r"no kernel image is available", 35, "error mentions 'no kernel image is available'"),
+    # gfx target names are not purely numeric (gfx90a, gfx90c), so accept
+    # hex digits after the "gfx" prefix instead of \d only.
+    (r"gfx[0-9a-f]{3,4}.* not (?:in|on) .*arch", 35, "error names a missing gfx in arch list"),
+]
+
+# Symptoms scored by check_4_render_group: failures opening /dev/kfd.
+# Patterns are lowercase because _keyword_score lowercases the symptom.
+KEYWORDS_KFD_PERMISSION = [
+    (r"unable to open /dev/kfd", 50, "error mentions /dev/kfd open failure"),
+    (r"/dev/kfd.*permission denied", 45, "error mentions /dev/kfd permission denied"),
+    (r"hsa_status_error_out_of_resources", 25, "HSA out-of-resources (often perms)"),
+    (r"failed to open kfd", 35, "error mentions kfd open failure"),
+]
+
+# Symptoms suggesting the kernel driver is not loaded. Only the first
+# pattern is specific; the two broad ones carry low weights so they nudge
+# the score rather than decide it.
+KEYWORDS_MODULE_NOT_LOADED = [
+    (r"rock module is not loaded", 50, "rocminfo says ROCk module is NOT loaded"),
+    (r"no devices? found", 20, "vague 'no devices found'"),
+    (r"hsa_status_error", 10, "HSA error (broad)"),
+]
+
+# Symptoms for "the ROCm bin directory is not on PATH". Shells word the
+# error differently: bash prints "<tool>: command not found", zsh prints
+# "command not found: <tool>". Both orders are accepted for both tools.
+KEYWORDS_PATH_MISSING = [
+    (r"rocminfo: command not found|command not found: rocminfo", 50, "rocminfo not on PATH"),
+    (r"hipcc: command not found|command not found.*hipcc", 40, "hipcc not on PATH"),
+    (r"/opt/rocm/bin", 15, "user mentions /opt/rocm/bin"),
+]
+
+# Shared-library / DLL load failures that name a HIP or ROCm runtime
+# library. The generic loader messages ("cannot open shared object file",
+# "dll load failed") are weighted lower than the library-specific ones.
+KEYWORDS_LIB_MISMATCH = [
+    (r"libamdhip64\.so", 50, "error mentions libamdhip64.so"),
+    (r"libhsa-runtime", 45, "error mentions libhsa-runtime"),
+    (r"libhipblas", 40, "error mentions libhipblas"),
+    (r"amdhip64_\d+\.dll", 50, "error mentions amdhip64_X.dll (Windows)"),
+    (r"hipblas\.dll", 40, "error mentions hipblas.dll (Windows)"),
+    (r"cannot open shared object file", 25, "ldopen failure"),
+    (r"dll load failed", 25, "Windows DLL load failure"),
+    (r"version `?glibc", 5, "tangential glibc version error"),
+]
+
+# Windows symptoms indicating the HIP SDK itself is absent.
+KEYWORDS_HIP_SDK_MISSING = [
+    (r"amdhip64.*not found", 50, "error names amdhip64 missing"),
+    (r"could not find hip", 40, "error mentions HIP not found"),
+    (r"hip_path.*not set", 35, "user mentions HIP_PATH unset"),
+    (r"hipinfo.*not recognized", 45, "Windows says hipInfo is not a command"),
+]
+
+# Windows symptoms naming the MSVC runtime DLLs rather than a HIP library.
+KEYWORDS_MSVC_REDIST = [
+    (r"vcruntime140(?:_1)?\.dll", 50, "error mentions vcruntime140 / vcruntime140_1"),
+    (r"api-ms-win-crt-.*\.dll", 35, "error mentions api-ms-win-crt-* DLL"),
+    (r"the (program|application) can't start because", 25, "Windows missing-DLL dialog text"),
+    (r"msvcp140\.dll", 30, "error mentions msvcp140.dll"),
+]
+
+# Package-manager symptoms pointing at a broken or stale AMD repo setup.
+KEYWORDS_REPO_BROKEN = [
+    # apt prints the URL first and the status afterwards, often on the next
+    # line ("Err:5 https://repo.radeon.com/... Release" / "  404  Not Found"),
+    # so accept either order; [\s\S] lets the second form span a newline.
+    (
+        r"404.*repo\.radeon\.com|repo\.radeon\.com[\s\S]*?\b404\b",
+        50,
+        "404 against repo.radeon.com",
+    ),
+    (r"release file (is )?not (yet )?valid", 30, "apt 'release file not valid'"),
+    (r"the following packages have unmet dependencies", 25, "apt unmet dependencies"),
+    (r"unable to locate package rocm", 35, "apt cannot find ROCm package"),
+]
+
+# Device-access symptoms that are often seen inside containers. All three
+# are moderate weights (20-30): none is enough to reach
+# MIN_SCORE_FOR_MATCH without a corroborating signal.
+KEYWORDS_CONTAINER = [
+    (r"hsa_status_error.*permission", 20, "HSA permission error (often container)"),
+    (r"/dev/dri.*permission", 30, "/dev/dri permission failure"),
+    (r"failed to open device", 25, "device open failure"),
+]
+
+# Vague "the job stalls" symptoms; deliberately low weights so they only
+# nudge the score.
+KEYWORDS_IOMMU_HANG = [
+    # The lookbehind stops "hang" from matching inside unrelated words such
+    # as "change" / "unchanged" / "exchange", while still matching "hang",
+    # "hangs", "hanging" and identifiers like "hangcheck".
+    (r"(?<![a-z])hang", 20, "user mentions 'hang'"),
+    (r"deadlock", 20, "user mentions deadlock"),
+    (r"timed out waiting", 25, "ring/queue timeout"),
+    (r"iommu", 30, "user mentions iommu"),
+]
+
+# dpkg / DKMS / installer failures. check_3_rocm_kernel_unsupported scans
+# the matched labels for the substring "dkms", so the wording of the DKMS
+# label below is load-bearing, not just display text.
+KEYWORDS_DPKG_BROKEN = [
+    (r"half[- ]configured", 50, "dpkg 'half-configured'"),
+    (r"dkms .*failed", 45, "DKMS build failure"),
+    (r"dpkg: error", 25, "generic dpkg error"),
+    (r"sub-process /usr/bin/dpkg returned", 25, "apt mentions dpkg failure"),
+    (r"--accept-eula", 40, "user mentions --accept-eula"),
+]
+
+# Fault symptoms scored by check_2_hsa_override_unneeded (an override that
+# masks the real gfx target).
+KEYWORDS_PAGE_FAULT = [
+    (r"page fault", 40, "user mentions page fault"),
+    (r"vm_fault", 35, "kernel vm_fault"),
+    (r"hw_fault", 30, "amdgpu HW fault"),
+    (r"out_of_registers", 30, "compiler OUT_OF_REGISTERS"),
+]
+
+
+def _keyword_score(symptom: str, table: list[tuple[str, int, str]]) -> tuple[int, list[str]]:
+    """Score `symptom` against `table`, counting only its two best hits.
+
+    Every pattern in `table` is searched in the lowercased symptom. Rather
+    than summing all matches (a long error that states one problem several
+    ways would be over-counted), only the two highest-ranked hits contribute.
+
+    Returns `(score, labels)`: the summed weights of those hits and their
+    evidence labels, strongest first. An empty symptom or no hit gives
+    `(0, [])`.
+    """
+    if not symptom:
+        return 0, []
+    lowered = symptom.lower()
+    matched = [
+        (weight, label)
+        for pattern, weight, label in table
+        if re.search(pattern, lowered)
+    ]
+    # Descending (weight, label) order; ties on weight fall back to label.
+    best = sorted(matched, reverse=True)[:2]
+    total = sum(weight for weight, _ in best)
+    labels = [label for _, label in best]
+    return total, labels
+
+
+# ---------------------------------------------------------------------------
+# Examination accessors. The script accepts either the dict that
+# `examine.py --json` emits OR a Python dict the agent has constructed by
+# hand. We avoid pulling in the dataclass module here to keep diagnose.py
+# usable standalone.
+# ---------------------------------------------------------------------------
+
+def _g(exam: dict, *path: str, default: Any = None) -> Any:
+    """Walk `path` through nested dicts, falling back to `default`.
+
+    `default` is returned when an intermediate value is not a dict, when a
+    key is absent, or when the value found at the end of the path is None.
+    """
+    node: Any = exam
+    for key in path:
+        if not (isinstance(node, dict) and key in node):
+            return default
+        node = node[key]
+    return default if node is None else node
+
+
+def _amd_gpus(exam: dict) -> list[dict]:
+    """Return the dict entries under exam["gpus"] whose `is_amd` value is truthy."""
+    amd_entries: list[dict] = []
+    for entry in _g(exam, "gpus", default=[]):
+        # Non-dict entries (e.g. from a hand-built exam) are skipped.
+        if isinstance(entry, dict) and entry.get("is_amd"):
+            amd_entries.append(entry)
+    return amd_entries
+
+
+def _amd_gfx_targets(exam: dict) -> list[str]:
+    """Return the non-empty `gfx_target` values of the AMD GPUs in `exam`."""
+    candidates = (gpu.get("gfx_target") for gpu in _amd_gpus(exam))
+    return [target for target in candidates if target]
+
+
+# ---------------------------------------------------------------------------
+# Per-misconfiguration checkers
+#
+# Each `check_*` function returns a Diagnosis with score=0 to mean "not a
+# match". `run_all_checks` filters those out. The MIN_SCORE_FOR_MATCH
+# threshold then promotes the survivors to "we think this is it".
+# ---------------------------------------------------------------------------
+
+def check_1_arch_not_in_wheel(exam: dict, symptom: str) -> Diagnosis:
+    """Detect a GPU gfx target that the framework build does not include.
+
+    Scoring:
+      * symptom keywords from KEYWORDS_INVALID_ISA (top two hits);
+      * +55 when some detected AMD gfx target is absent from
+        `framework_arch_list`;
+      * -30 when the arch list is known and covers every detected target.
+    A net score of zero or less yields the "not a match" Diagnosis.
+    """
+    score, evidence = _keyword_score(symptom, KEYWORDS_INVALID_ISA)
+
+    arch_list = _g(exam, "framework_arch_list", default=[]) or []
+    targets = _amd_gfx_targets(exam)
+    if arch_list and targets:
+        uncovered = [t for t in targets if t not in arch_list]
+        if uncovered:
+            # Direct state evidence: the strongest possible signal.
+            score += 55
+            evidence.append(
+                f"GPU gfx target(s) {uncovered} not in framework arch list {arch_list}"
+            )
+        else:
+            # Strong negative: every GPU is covered, so a weak keyword hit
+            # alone should not surface this diagnosis.
+            score -= 30
+            evidence.append(
+                f"framework arch list {arch_list} already includes GPU target(s) {targets}"
+            )
+
+    framework = _g(exam, "framework", default="")
+    arch_unknown = not arch_list
+    if framework in ("pytorch", "llama-cpp") and arch_unknown and targets:
+        # A framework and a GPU are known, but without the arch list only
+        # the symptom keywords can speak; record how to get the list.
+        evidence.append(
+            "Framework arch list unknown -- cannot confirm without "
+            "`python -c 'import torch; print(torch.cuda.get_arch_list())'`."
+        )
+
+    if score <= 0:
+        return Diagnosis(id="fix-1-arch", title="GPU gfx not in framework arch list", score=0)
+
+    summary = (
+        "Reinstall the framework from a wheel index that includes this GPU's "
+        "gfx target. Use HSA_OVERRIDE_GFX_VERSION ONLY as a temporary "
+        "workaround when no native wheel exists."
+    )
+    commands = [
+        "# Recommended: PyTorch ROCm nightly that ships the gfx115x kernels.",
+        "pip uninstall -y torch torchvision torchaudio",
+        "pip install --pre torch torchvision torchaudio \\\n"
+        "  --index-url https://download.pytorch.org/whl/nightly/rocm6.4",
+        "# llama.cpp: rebuild with AMDGPU_TARGETS set to this GPU's gfx.",
+        "# cmake -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=<gfx_target>",
+    ]
+    verify = (
+        "python -c \"import torch; print(torch.cuda.is_available(), "
+        "torch.cuda.get_arch_list())\""
+    )
+    notes = [
+        "TheRock (rocm/TheRock) ships nightly per-gfx wheels and is the "
+        "preferred fallback when the official pytorch wheel index does "
+        "not yet cover your gfx target.",
+    ]
+    return Diagnosis(
+        id="fix-1-arch",
+        title="GPU gfx target not in framework's build arch list",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=Fix(
+            summary=summary,
+            commands=commands,
+            fix_id="fix-1-arch",
+            auto_applicable=False,
+            verify=verify,
+            notes=notes,
+        ),
+    )
+
+
+def check_2_hsa_override_unneeded(exam: dict, symptom: str) -> Diagnosis:
+    """Detect HSA_OVERRIDE_GFX_VERSION set where the GPU is natively supported.
+
+    Only fires when the variable is present and non-empty in exam["env"].
+    Scoring: base 30; plus KEYWORDS_PAGE_FAULT hits; +20 when the captured
+    amdgpu kernel log mentions a page fault; +25 when every detected gfx
+    target is already in `framework_arch_list`. The returned fix text
+    depends on exam["os_family"] (treated as "linux" when absent).
+    """
+    fix_id = "fix-2-unset-override"
+    env = _g(exam, "env", default={}) or {}
+    override = env.get("HSA_OVERRIDE_GFX_VERSION", "")
+    if not override:
+        return Diagnosis(id=fix_id, title="HSA_OVERRIDE_GFX_VERSION set unnecessarily", score=0)
+
+    evidence = [f"HSA_OVERRIDE_GFX_VERSION={override} is set in the current shell"]
+
+    # Page faults are the classic late-binding symptom of an override that
+    # masks the real gfx.
+    keyword_points, keyword_evidence = _keyword_score(symptom, KEYWORDS_PAGE_FAULT)
+    total = 30 + keyword_points
+    evidence.extend(keyword_evidence)
+
+    kernel_log = _g(exam, "dmesg_amdgpu_tail", default=[]) or []
+    for line in kernel_log:
+        if "page fault" in line.lower():
+            total += 20
+            evidence.append("kernel ring shows amdgpu page faults")
+            break
+
+    arch_list = _g(exam, "framework_arch_list", default=[]) or []
+    targets = _amd_gfx_targets(exam)
+    if arch_list and targets and not any(t not in arch_list for t in targets):
+        total += 25
+        evidence.append(
+            f"every detected GPU target ({targets}) is in the framework arch "
+            f"list ({arch_list}); the override is hiding the native gfx."
+        )
+
+    on_windows = _g(exam, "os_family", default="linux") == "windows"
+    if on_windows:
+        summary = "Clear HSA_OVERRIDE_GFX_VERSION (Windows) and use the native HIP SDK / wheel."
+        commands = [
+            "# Inspect the User and Machine env scopes:",
+            "[Environment]::GetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION','User')",
+            "[Environment]::GetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION','Machine')",
+            "# Clear from the User scope (does NOT affect already-open shells):",
+            'setx HSA_OVERRIDE_GFX_VERSION ""',
+            "# Or remove via System Properties -> Environment Variables.",
+        ]
+        verify = (
+            "powershell -NoProfile -Command "
+            "\"[Environment]::GetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION','User')\""
+        )
+    else:
+        summary = "Unset HSA_OVERRIDE_GFX_VERSION and use the native wheel."
+        commands = [
+            "unset HSA_OVERRIDE_GFX_VERSION",
+            "# Also remove it from ~/.bashrc / ~/.zshrc / ~/.profile if persisted.",
+        ]
+        verify = (
+            "env | grep HSA_OVERRIDE_GFX_VERSION || echo OK_UNSET; "
+            "python -c \"import torch; print(torch.cuda.is_available())\""
+        )
+
+    return Diagnosis(
+        id=fix_id,
+        title="HSA_OVERRIDE_GFX_VERSION set on a GPU that has a native wheel",
+        score=min(total, 100),
+        evidence=evidence,
+        fix=Fix(
+            summary=summary,
+            commands=commands,
+            fix_id=fix_id,
+            auto_applicable=True,
+            verify=verify,
+        ),
+    )
+
+
+def check_3_rocm_kernel_unsupported(exam: dict, symptom: str) -> Diagnosis:
+    """ROCm <-> distro/kernel unsupported triple.
+
+    Two signals, each worth 30 points:
+      * `rocm_version` is set but `amdgpu_loaded` is explicitly False;
+      * the symptom's top KEYWORDS_DPKG_BROKEN hits include a DKMS failure.
+
+    No compatibility matrix is hardcoded here -- it would be stale within
+    months -- so the fix points the user at the live AMD matrix page and
+    only echoes the kernel/distro/ROCm values found in `exam`.
+
+    Returns a score-0 Diagnosis when neither signal is present.
+    """
+    score = 0
+    evidence: list[str] = []
+
+    kernel = _g(exam, "kernel_release", default="")
+    distro = _g(exam, "distro_id", default="")
+    distro_v = _g(exam, "distro_version", default="")
+    rocm_version = _g(exam, "rocm_version", default="")
+    amdgpu_loaded = _g(exam, "amdgpu_loaded", default=None)
+
+    # `is False` on purpose: an unknown (None) module state is not evidence.
+    if rocm_version and amdgpu_loaded is False:
+        score += 30
+        evidence.append(
+            f"ROCm {rocm_version} is installed but the amdgpu kernel module is not loaded; "
+            "this is typical when DKMS failed against an unsupported kernel."
+        )
+
+    # Only the DKMS hit feeds this score, so only DKMS labels are reported
+    # as evidence; other dpkg/installer labels belong to other checks. The
+    # keyword weight itself is ignored in favour of a flat +30.
+    _, kw_ev = _keyword_score(symptom, KEYWORDS_DPKG_BROKEN)
+    dkms_ev = [label for label in kw_ev if "dkms" in label.lower()]
+    if dkms_ev:
+        score += 30
+        evidence += dkms_ev
+
+    if score <= 0:
+        return Diagnosis(id="fix-3-rocm-kernel", title="ROCm/distro/kernel triple unsupported", score=0)
+
+    fix = Fix(
+        summary=(
+            "Cross-check your kernel/distro against the live AMD compatibility "
+            "matrix before reinstalling."
+        ),
+        commands=[
+            f"# Current: kernel={kernel} distro={distro} {distro_v} rocm={rocm_version}",
+            "# Compare to the live AMD matrix:",
+            "#   https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html",
+            "# If your kernel is above the supported range, install the HWE",
+            "# kernel that matches ROCm, or rerun amdgpu-install with --no-dkms.",
+        ],
+        fix_id="fix-3-rocm-kernel",
+        auto_applicable=False,
+        needs_reboot=True,
+        verify="lsmod | grep amdgpu && rocminfo | head -n 20",
+    )
+    return Diagnosis(
+        id="fix-3-rocm-kernel",
+        title="ROCm version + distro/kernel form an unsupported triple",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_4_render_group(exam: dict, symptom: str) -> Diagnosis:
+    """User not in render/video groups, or /dev/kfd group is wrong.
+
+    Scoring: +35 when `in_render_group` is False, +10 when `in_video_group`
+    is False, +25 when /dev/kfd exists but is not writable by the user,
+    plus KEYWORDS_KFD_PERMISSION hits from the symptom text.
+
+    The proposed `usermod` command only ever names 'render' and/or 'video'.
+    The /dev/kfd owner group comes from the exam JSON and is interpolated
+    into a sudo command, so it is used only when it is one of those two
+    names; anything else is reported in a note rather than suggested as a
+    group to join.
+
+    Returns a score-0 Diagnosis when no signal is present.
+    """
+    score = 0
+    evidence: list[str] = []
+
+    in_render = _g(exam, "in_render_group", default=None)
+    in_video = _g(exam, "in_video_group", default=None)
+    kfd = _g(exam, "kfd", default=None) or {}
+    if not isinstance(kfd, dict):
+        # A hand-built exam may carry a non-dict here; treat it as unknown.
+        kfd = {}
+    if in_render is False:
+        score += 35
+        evidence.append("user is NOT in the 'render' group")
+    if in_video is False:
+        score += 10
+        evidence.append("user is NOT in the 'video' group")
+    if kfd.get("exists") is True and kfd.get("user_can_write") is False:
+        score += 25
+        evidence.append(
+            f"/dev/kfd exists (mode {kfd.get('mode')}, group {kfd.get('owner_group')}) "
+            "but the current user can't write to it"
+        )
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_KFD_PERMISSION)
+    score += kw_score
+    evidence += kw_ev
+
+    if score <= 0:
+        return Diagnosis(id="fix-4-render-group", title="User missing render/video group", score=0)
+
+    notes = [
+        "Group membership only takes effect after a full re-login (or "
+        "reboot). `newgrp render` will give the current shell access "
+        "but not other terminals or services.",
+    ]
+    owner_group = kfd.get("owner_group")
+    if owner_group in ("render", "video"):
+        kfd_group = owner_group
+    else:
+        kfd_group = "render"
+        if owner_group:
+            notes.append(
+                f"/dev/kfd is owned by group {owner_group!r}, not 'render' or "
+                "'video'. This fix only adds the standard groups; investigate "
+                "the /dev/kfd ownership separately instead of joining that group."
+            )
+
+    if kfd_group == "video":
+        # Avoid the redundant "video,video" when kfd is video-owned.
+        groups = "video"
+        summary = "Add the current user to 'video' and log out/in."
+    else:
+        groups = f"{kfd_group},video"
+        summary = f"Add the current user to '{kfd_group}' (and 'video' for safety) and log out/in."
+
+    fix = Fix(
+        summary=summary,
+        commands=[
+            f"sudo usermod -a -G {groups} \"$USER\"",
+        ],
+        needs_sudo=True,
+        needs_relogin=True,
+        fix_id="fix-4-render-group",
+        auto_applicable=True,
+        verify="groups | tr ' ' '\\n' | grep -E '^(render|video)$' && ls -l /dev/kfd && rocminfo | head -n 5",
+        notes=notes,
+    )
+    return Diagnosis(
+        id="fix-4-render-group",
+        title="User not in render/video group (or /dev/kfd owned by the other group)",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_5_amdgpu_blacklisted(exam: dict, symptom: str) -> Diagnosis:
+    """amdgpu module not loaded or actively blacklisted."""
+    score = 0
+    evidence: list[str] = []
+
+    amdgpu_loaded = _g(exam, "amdgpu_loaded", default=None)
+    blacklisted = _g(exam, "amdgpu_blacklisted_in", default=[]) or []
+    rocm_status = _g(exam, "rocminfo_status", default="")
+    secure_boot = _g(exam, "secure_boot", default="unknown")
+
+    if blacklisted:
+        score += 55
+        evidence.append(f"amdgpu is blacklisted in: {blacklisted}")
+    if amdgpu_loaded is False:
+        score += 35
+        evidence.append("amdgpu module is not loaded")
+    if rocm_status == "not-loaded":
+        score += 25
+        evidence.append("rocminfo says 'ROCk module is NOT loaded'")
+    if secure_boot == "enabled" and amdgpu_loaded is False:
+        score += 10
+        evidence.append(
+            "Secure Boot is enabled and amdgpu didn't load -- DKMS modules "
+            "are often blocked until you sign them or disable Secure Boot."
+        )
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_MODULE_NOT_LOADED)
+    score += kw_score
+    evidence += kw_ev
+
+    if score <= 0:
+        return Diagnosis(id="fix-5-amdgpu-load", title="amdgpu not loaded", score=0)
+
+    commands: list[str] = []
+    if blacklisted:
+        for f in blacklisted:
+            commands.append(f"# Inspect & remove the blacklist line: sudo $EDITOR {f}")
+        commands.append("sudo update-initramfs -u   # Debian/Ubuntu")
+        commands.append("sudo dracut -f             # Fedora/RHEL")
+    commands.append("sudo modprobe amdgpu")
+    if secure_boot == "enabled":
+        commands.append(
+            "# Secure Boot is on; if amdgpu still won't load, the DKMS "
+            "module isn't signed. Sign it (mokutil) or disable Secure Boot."
+        )
+
+    fix = Fix(
+        summary="Remove amdgpu from any modprobe blacklist and load it.",
+        commands=commands,
+        needs_sudo=True,
+        needs_reboot=bool(blacklisted),
+        fix_id="fix-5-amdgpu-load",
+        auto_applicable=False,
+        verify="lsmod | grep amdgpu && rocminfo | head -n 5",
+    )
+    return Diagnosis(
+        id="fix-5-amdgpu-load",
+        title="amdgpu kernel module not loaded (or blacklisted)",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_6_path_missing(exam: dict, symptom: str) -> Diagnosis:
+    """ROCm/HIP binaries not on PATH after install."""
+    score = 0
+    evidence: list[str] = []
+
+    os_family = _g(exam, "os_family", default="linux")
+    env_path = _g(exam, "env", default={}).get("PATH", "")
+
+    if os_family == "windows":
+        sdk_path = _g(exam, "hip_sdk_path", default="")
+        hipinfo_present = _g(exam, "hipinfo_present", default=None)
+        bin_dir = f"{sdk_path}\\bin" if sdk_path else r"C:\Program Files\AMD\ROCm\<version>\bin"
+        if sdk_path and hipinfo_present is False:
+            score += 50
+            evidence.append(f"{sdk_path} exists but hipInfo.exe wasn't found in its bin directory")
+        if sdk_path and env_path and bin_dir.lower() not in env_path.lower():
+            score += 20
+            evidence.append(f"{bin_dir} is not in PATH")
+    else:
+        rocm_path = _g(exam, "rocm_path", default="")
+        rocminfo_present = _g(exam, "rocminfo_present", default=None)
+        bin_dir = f"{rocm_path}/bin" if rocm_path else "/opt/rocm/bin"
+        if rocm_path and rocminfo_present is False:
+            score += 50
+            evidence.append(f"{rocm_path} exists but `rocminfo` is not on PATH")
+        if rocm_path and env_path and bin_dir not in env_path:
+            score += 20
+            evidence.append(f"{bin_dir} is not in $PATH")
+
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_PATH_MISSING)
+    score += kw_score
+    evidence += kw_ev
+
+    if score <= 0:
+        return Diagnosis(id="fix-6-path", title="ROCm not on PATH", score=0)
+
+    if os_family == "windows":
+        fix = Fix(
+            summary=f"Add {bin_dir} to your User PATH and reopen the shell.",
+            commands=[
+                f'setx PATH "%PATH%;{bin_dir}"',
+                "# Or: System Properties -> Environment Variables -> Path -> Edit -> New.",
+                "# `setx` only affects NEW shells; close and reopen this terminal afterwards.",
+            ],
+            fix_id="fix-6-path",
+            auto_applicable=True,
+            verify=f'powershell -NoProfile -Command "& \\"{bin_dir}\\hipInfo.exe\\" | Select-Object -First 5"',
+        )
+    else:
+        fix = Fix(
+            summary=f"Add {bin_dir} to PATH for this shell and persist in your shell rc.",
+            commands=[
+                f"export PATH={bin_dir}:$PATH",
+                f"echo 'export PATH={bin_dir}:$PATH' >> ~/.bashrc   # or ~/.zshrc",
+            ],
+            fix_id="fix-6-path",
+            auto_applicable=True,
+            verify="rocminfo | head -n 5 && hipcc --version",
+        )
+    return Diagnosis(
+        id="fix-6-path",
+        title="ROCm/HIP binaries not on PATH after install",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_7_stale_repos(exam: dict, symptom: str) -> Diagnosis:
+    """Stale or conflicting APT/DNF repos from prior installer runs."""
+    score = 0
+    evidence: list[str] = []
+    repos = _g(exam, "rocm_repos_seen", default=[]) or []
+    # Two or more ROCm repo files is the usual smoking gun (often one from
+    # the old amdgpu-install pin and one from a fresh radeon.com line).
+    if len(repos) >= 2:
+        score += 40
+        evidence.append(
+            f"{len(repos)} ROCm/AMDGPU repo files present: {repos}"
+        )
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_REPO_BROKEN)
+    score += kw_score
+    evidence += kw_ev
+
+    if score <= 0:
+        return Diagnosis(id="fix-7-stale-repos", title="Stale ROCm repos", score=0)
+
+    commands = ["ls /etc/apt/sources.list.d/ | grep -iE 'rocm|amdgpu|radeon' || true"]
+    for r in repos:
+        commands.append(f"# sudo mv {r} {r}.bak     # quarantine, do not delete yet")
+    commands.append("sudo apt update")
+    commands.append("# If apt now resolves, reinstall via the correct method only:")
+    commands.append("#   amdgpu-install --usecase=rocm,hip --no-dkms   # if you want amdgpu-install")
+    commands.append("#   or use the distro packages exclusively")
+    fix = Fix(
+        summary=(
+            "Quarantine duplicate ROCm/AMDGPU repo files and resolve apt before "
+            "re-running any installer."
+        ),
+        commands=commands,
+        needs_sudo=True,
+        fix_id="fix-7-stale-repos",
+        auto_applicable=False,
+        verify="sudo apt update 2>&1 | tail -n 20",
+    )
+    return Diagnosis(
+        id="fix-7-stale-repos",
+        title="Stale or conflicting APT/DNF repos from prior installer runs",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_8_wheel_rocm_mismatch(exam: dict, symptom: str) -> Diagnosis:
+    """Framework wheel built for a different ROCm major than the system."""
+    score = 0
+    evidence: list[str] = []
+    os_family = _g(exam, "os_family", default="linux")
+    fw_rocm = _g(exam, "framework_rocm_version", default="")
+    if os_family == "windows":
+        sys_rocm = _g(exam, "hip_sdk_version", default="")
+    else:
+        sys_rocm = _g(exam, "rocm_version", default="")
+
+    def _major(s: str) -> str | None:
+        m = re.search(r"(\d+)\.(\d+)", s)
+        return f"{m.group(1)}.{m.group(2)}" if m else None
+
+    fw_major = _major(fw_rocm)
+    sys_major = _major(sys_rocm)
+    if fw_major and sys_major and fw_major != sys_major:
+        score += 50
+        runtime = "HIP SDK" if os_family == "windows" else "ROCm"
+        evidence.append(
+            f"Framework links HIP {fw_major} but system {runtime} is {sys_major}"
+        )
+
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_LIB_MISMATCH)
+    score += kw_score
+    evidence += kw_ev
+
+    if score <= 0:
+        return Diagnosis(id="fix-8-wheel-rocm", title="Wheel/ROCm mismatch", score=0)
+
+    if os_family == "windows":
+        fix = Fix(
+            summary=(
+                "Reinstall the framework against the HIP SDK major you have "
+                "installed (or install the HIP SDK major the wheel needs)."
+            ),
+            commands=[
+                "pip uninstall -y torch torchvision torchaudio",
+                "# TheRock publishes Windows ROCm wheels per HIP SDK release:",
+                "#   https://github.com/ROCm/TheRock",
+                "# Match the wheel index to the HIP SDK major you have on disk.",
+                "python -c \"import torch; print(torch.__version__, torch.version.hip)\"",
+            ],
+            fix_id="fix-8-wheel-rocm",
+            auto_applicable=False,
+            verify="python -c \"import torch; print(torch.cuda.is_available(), torch.version.hip)\"",
+        )
+    else:
+        fix = Fix(
+            summary=(
+                "Reinstall the framework from the wheel index that matches the "
+                "system ROCm major (or upgrade the system ROCm to match the wheel)."
+            ),
+            commands=[
+                "pip uninstall -y torch torchvision torchaudio",
+                "# Pick the index that matches your system ROCm major. Examples:",
+                "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4",
+                "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3",
+                "# Then re-check:",
+                "python -c \"import torch; print(torch.__version__, torch.version.hip)\"",
+            ],
+            fix_id="fix-8-wheel-rocm",
+            auto_applicable=False,
+            verify="python -c \"import torch; print(torch.cuda.is_available(), torch.version.hip)\"",
+        )
+    return Diagnosis(
+        id="fix-8-wheel-rocm",
+        title="Framework wheel built for a different ROCm major than the system",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_9_igpu_dgpu_collision(exam: dict, symptom: str) -> Diagnosis:
+    """iGPU enumerated alongside dGPU and crashing the runtime."""
+    has_apu = _g(exam, "has_apu", default=False)
+    has_discrete = _g(exam, "has_discrete_amd", default=False)
+    if not (has_apu and has_discrete):
+        return Diagnosis(id="fix-9-igpu-dgpu", title="iGPU+dGPU collision", score=0)
+
+    env = _g(exam, "env", default={}) or {}
+    visible = env.get("HIP_VISIBLE_DEVICES") or env.get("ROCR_VISIBLE_DEVICES")
+    score = 40
+    evidence = ["machine has both an AMD APU and an AMD discrete GPU"]
+    if not visible:
+        score += 25
+        evidence.append("HIP_VISIBLE_DEVICES is unset; runtime sees BOTH GPUs")
+    # Crashes are vague but a crash on a dual-GPU box is the classic signal.
+    if symptom and re.search(r"(crash|segfault|signal 11)", symptom, re.IGNORECASE):
+        score += 15
+        evidence.append("user mentions a crash / segfault")
+
+    gfx_targets = _amd_gfx_targets(exam)
+    if _g(exam, "os_family", default="linux") == "windows":
+        fix = Fix(
+            summary=(
+                "Pin the HIP runtime to the discrete GPU with HIP_VISIBLE_DEVICES "
+                "so the iGPU is hidden."
+            ),
+            commands=[
+                "# Confirm which index is the dGPU (hipInfo.exe output order):",
+                '& "$env:HIP_PATH\\bin\\hipInfo.exe" | Select-String "device#|Name|gcnArchName"',
+                "# Then persist HIP_VISIBLE_DEVICES in the User environment:",
+                "setx HIP_VISIBLE_DEVICES 1",
+                "# `setx` only takes effect in NEW shells; reopen the terminal.",
+            ],
+            fix_id="fix-9-igpu-dgpu",
+            auto_applicable=True,
+            verify=(
+                'powershell -NoProfile -Command "$env:HIP_VISIBLE_DEVICES=1; '
+                'python -c \\"import torch; print(torch.cuda.device_count())\\""'
+            ),
+            notes=[
+                f"Detected gfx targets: {gfx_targets}. The dGPU is usually the higher-numbered family (gfx11xx).",
+            ],
+        )
+    else:
+        fix = Fix(
+            summary=(
+                "Pin the runtime to the discrete GPU with HIP_VISIBLE_DEVICES "
+                "so the iGPU is hidden."
+            ),
+            commands=[
+                "# Confirm which index is the dGPU (`rocminfo` output order):",
+                "rocminfo | grep -E 'Agent |gfx|Marketing'",
+                "# Then pin HIP to the dGPU (typically index 1 when an APU is index 0):",
+                "export HIP_VISIBLE_DEVICES=1",
+                "# Persist in your shell rc or your launch script.",
+            ],
+            fix_id="fix-9-igpu-dgpu",
+            auto_applicable=False,
+            verify="HIP_VISIBLE_DEVICES=1 python -c \"import torch; print(torch.cuda.device_count())\"",
+            notes=[
+                f"Detected gfx targets: {gfx_targets}. The dGPU is usually the higher-numbered family (gfx11xx).",
+            ],
+        )
+    return Diagnosis(
+        id="fix-9-igpu-dgpu",
+        title="iGPU enumerated alongside dGPU and destabilising the runtime",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_10_container_devices(exam: dict, symptom: str) -> Diagnosis:
+    """Container can't see /dev/kfd or /dev/dri/renderD*."""
+    in_container = _g(exam, "in_container", default=False)
+    if not in_container:
+        return Diagnosis(id="fix-10-container", title="Container missing devices", score=0)
+
+    score = 25
+    evidence = [f"running inside a {_g(exam, 'container_kind', default='container')}"]
+    kfd = _g(exam, "kfd", default=None) or {}
+    if kfd.get("exists") is False:
+        score += 40
+        evidence.append("/dev/kfd is not present in the container")
+    elif kfd.get("user_can_write") is False:
+        score += 30
+        evidence.append("/dev/kfd is present but not writable by the container user")
+    if not _g(exam, "render_devices", default=[]):
+        score += 20
+        evidence.append("no /dev/dri/renderD* visible in the container")
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_CONTAINER)
+    score += kw_score
+    evidence += kw_ev
+
+    fix = Fix(
+        summary=(
+            "Re-launch the container with the AMD devices and the render group "
+            "passed through."
+        ),
+        commands=[
+            "# Docker / Podman flags AMD-recommends:",
+            "docker run --rm -it \\",
+            "  --device=/dev/kfd \\",
+            "  --device=/dev/dri \\",
+            "  --group-add render \\",
+            "  --security-opt seccomp=unconfined \\",
+            "  --shm-size=8g \\",
+            "  rocm/pytorch:latest",
+            "# Rootless podman: also pass `--userns=keep-id` and ensure the",
+            "# host user is in the render group; podman maps it through.",
+        ],
+        fix_id="fix-10-container",
+        auto_applicable=False,
+        verify="rocminfo | head -n 5",
+        notes=[
+            "Use rocm/pytorch or rocm/dev-ubuntu-22.04 as a known-good image. "
+            "Mixing host ROCm + container ROCm versions is a separate footgun.",
+        ],
+    )
+    return Diagnosis(
+        id="fix-10-container",
+        title="Container can't see /dev/kfd or /dev/dri/renderD*",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_11_iommu_hang(exam: dict, symptom: str) -> Diagnosis:
+    """Multi-GPU hang on systems with IOMMU enabled."""
+    amd_count = len(_amd_gpus(exam))
+    if amd_count < 2:
+        return Diagnosis(id="fix-11-iommu", title="Multi-GPU IOMMU hang", score=0)
+
+    score = 0
+    evidence = [f"{amd_count} AMD GPUs detected"]
+    iommu = _g(exam, "iommu_kernel_param", default="")
+    if iommu and iommu != "pt":
+        score += 25
+        evidence.append(f"kernel cmdline has iommu={iommu} (not 'pt')")
+    if not iommu:
+        # IOMMU is on by default on most modern BIOSes even without the
+        # kernel cmdline flag. A multi-GPU hang is still the classic signal.
+        score += 10
+        evidence.append("no iommu= flag on kernel cmdline (default may be 'on')")
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_IOMMU_HANG)
+    score += kw_score
+    evidence += kw_ev
+
+    if score < 25:
+        return Diagnosis(id="fix-11-iommu", title="Multi-GPU IOMMU hang", score=0)
+
+    fix = Fix(
+        summary=(
+            "Add `iommu=pt` to the kernel command line so DMA goes through "
+            "pass-through mode. This requires editing GRUB and rebooting."
+        ),
+        commands=[
+            "# Inspect the current cmdline:",
+            "cat /proc/cmdline",
+            "# Edit /etc/default/grub and add iommu=pt to GRUB_CMDLINE_LINUX_DEFAULT:",
+            "sudo $EDITOR /etc/default/grub",
+            "sudo update-grub                # Debian/Ubuntu",
+            "sudo grub2-mkconfig -o /boot/grub2/grub.cfg   # Fedora/RHEL",
+            "# Reboot for the change to take effect, then retry the multi-GPU job.",
+        ],
+        needs_sudo=True,
+        needs_reboot=True,
+        fix_id="fix-11-iommu",
+        auto_applicable=False,
+        verify="cat /proc/cmdline | grep -o 'iommu=\\w*'",
+    )
+    return Diagnosis(
+        id="fix-11-iommu",
+        title="Multi-GPU hang on systems with IOMMU enabled",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_12_amdgpu_install_broken(exam: dict, symptom: str) -> Diagnosis:
+    """amdgpu-install left a broken DKMS / repo state."""
+    score = 0
+    evidence: list[str] = []
+    method = _g(exam, "rocm_install_method", default="")
+    if method == "amdgpu-install":
+        evidence.append("ROCm was installed via amdgpu-install")
+    else:
+        # Not a hard requirement; users sometimes hit this after the
+        # installer fails and they don't realize they did one. Don't add
+        # base score, but allow keyword evidence to count.
+        pass
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_DPKG_BROKEN)
+    score += kw_score
+    evidence += kw_ev
+    if method == "amdgpu-install" and kw_score > 0:
+        score += 20
+
+    if score <= 0:
+        return Diagnosis(id="fix-12-installer", title="amdgpu-install broken state", score=0)
+
+    fix = Fix(
+        summary=(
+            "Run amdgpu-install's documented uninstall sequence to clear the "
+            "half-configured state, THEN reinstall without the flag that broke it."
+        ),
+        commands=[
+            "sudo amdgpu-install --uninstall",
+            "sudo apt autoremove --purge -y",
+            "sudo apt update",
+            "# Reinstall. Drop --accept-eula if you used it previously; the",
+            "# newer installer rejects it and leaves a half-configured repo.",
+            "sudo amdgpu-install --usecase=rocm,hip",
+        ],
+        needs_sudo=True,
+        needs_reboot=True,
+        fix_id="fix-12-installer",
+        auto_applicable=False,
+        verify="dpkg -l | grep -E 'rocm|amdgpu' | head -n 20 && rocminfo | head -n 5",
+        notes=[
+            "If `apt autoremove` warns it will remove unrelated packages, stop "
+            "and resolve those by hand before continuing.",
+        ],
+    )
+    return Diagnosis(
+        id="fix-12-installer",
+        title="amdgpu-install left a broken state (repo regression / partial DKMS)",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_13_hip_sdk_missing(exam: dict, symptom: str) -> Diagnosis:
+    """Windows: framework imports HIP but the HIP SDK isn't installed."""
+    if _g(exam, "os_family", default="") != "windows":
+        return Diagnosis(id="fix-13-hip-sdk-missing", title="HIP SDK not installed", score=0)
+
+    score = 0
+    evidence: list[str] = []
+    sdk_path = _g(exam, "hip_sdk_path", default="")
+    hipinfo_present = _g(exam, "hipinfo_present", default=None)
+    framework = _g(exam, "framework", default="")
+    fw_rocm = _g(exam, "framework_rocm_version", default="")
+    has_amd = _g(exam, "has_amd_gpu", default=False)
+
+    if not sdk_path:
+        score += 35
+        evidence.append("No HIP SDK install found under C:\\Program Files\\AMD\\ROCm")
+    elif hipinfo_present is False:
+        score += 30
+        evidence.append(f"HIP SDK at {sdk_path} but hipInfo.exe is missing from its bin directory")
+
+    if has_amd and framework == "pytorch" and fw_rocm.startswith("hip="):
+        score += 25
+        evidence.append("PyTorch is a HIP build but the HIP SDK is not present on this host")
+
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_HIP_SDK_MISSING)
+    score += kw_score
+    evidence += kw_ev
+
+    if score <= 0:
+        return Diagnosis(id="fix-13-hip-sdk-missing", title="HIP SDK not installed", score=0)
+
+    fix = Fix(
+        summary=(
+            "Install the AMD HIP SDK for Windows; the HIP runtime DLLs and "
+            "hipInfo.exe come from there."
+        ),
+        commands=[
+            "# Download and install the HIP SDK (matched to your framework's HIP major):",
+            "#   https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html",
+            "# After install, reopen the shell so HIP_PATH and PATH pick up the new install.",
+        ],
+        fix_id="fix-13-hip-sdk-missing",
+        auto_applicable=False,
+        verify=(
+            'powershell -NoProfile -Command '
+            '"& \\"$env:HIP_PATH\\bin\\hipInfo.exe\\" | Select-Object -First 5"'
+        ),
+        notes=[
+            "If you only need PyTorch on Windows AMD and don't need the C/C++ "
+            "HIP toolchain, the TheRock wheels bundle their own HIP runtime "
+            "and may not require a system HIP SDK install.",
+        ],
+    )
+    return Diagnosis(
+        id="fix-13-hip-sdk-missing",
+        title="HIP SDK not installed (Windows)",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_14_adrenalin_too_old(exam: dict, symptom: str) -> Diagnosis:
+    """Windows: HIP SDK present but Adrenalin (kernel-mode driver) is too old.
+
+    We deliberately do NOT hardcode a minimum Adrenalin version here: AMD
+    bumps the HIP SDK <-> Adrenalin pairing every release, and the live
+    table goes stale within months. Instead we trigger on observable
+    failure patterns (HIP SDK present + hipInfo unable to enumerate, or
+    user pasted a 'Driver version too old' style symptom) and route the
+    user to the live release notes.
+    """
+    if _g(exam, "os_family", default="") != "windows":
+        return Diagnosis(id="fix-14-adrenalin-too-old", title="Adrenalin driver too old", score=0)
+
+    score = 0
+    evidence: list[str] = []
+    sdk_path = _g(exam, "hip_sdk_path", default="")
+    hipinfo_present = _g(exam, "hipinfo_present", default=False)
+    hipinfo_status = _g(exam, "hipinfo_status", default="")
+    adrenalin = _g(exam, "adrenalin_version", default="")
+
+    if sdk_path and hipinfo_present and hipinfo_status not in ("ok", ""):
+        score += 35
+        evidence.append(
+            f"HIP SDK at {sdk_path} is installed but hipInfo.exe reports {hipinfo_status!r}; "
+            "this typically means the kernel-mode driver doesn't match the SDK."
+        )
+    if adrenalin:
+        evidence.append(f"Adrenalin / kernel-mode driver version: {adrenalin}")
+
+    if symptom and re.search(r"driver.*(too old|out of date|unsupported)", symptom, re.IGNORECASE):
+        score += 35
+        evidence.append("error mentions 'driver too old / out of date / unsupported'")
+    if symptom and re.search(r"hsa.*invalid agent|no agents (were )?found", symptom, re.IGNORECASE):
+        score += 25
+        evidence.append("HSA error suggests driver/runtime can't enumerate the GPU")
+
+    if score <= 0:
+        return Diagnosis(id="fix-14-adrenalin-too-old", title="Adrenalin driver too old", score=0)
+
+    fix = Fix(
+        summary=(
+            "Update the AMD Adrenalin (or PRO) graphics driver to the version "
+            "the HIP SDK release notes call out as the supported pairing."
+        ),
+        commands=[
+            "# Cross-check the HIP SDK release notes for the exact driver pairing:",
+            "#   https://rocm.docs.amd.com/projects/install-on-windows/en/latest/install/install.html",
+            "# Then download the matching driver from:",
+            "#   https://www.amd.com/en/support",
+            "# Reboot after the install for the kernel-mode driver to take effect.",
+        ],
+        needs_reboot=True,
+        fix_id="fix-14-adrenalin-too-old",
+        auto_applicable=False,
+        verify=(
+            'powershell -NoProfile -Command '
+            '"(Get-CimInstance Win32_VideoController | '
+            "Where-Object { $_.Name -like '*AMD*' -or $_.Name -like '*Radeon*' } | "
+            'Select-Object -First 1).DriverVersion"'
+        ),
+    )
+    return Diagnosis(
+        id="fix-14-adrenalin-too-old",
+        title="Adrenalin / kernel-mode driver too old for the installed HIP SDK",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+def check_15_msvc_redist(exam: dict, symptom: str) -> Diagnosis:
+    """Windows: MSVC runtime DLL missing -- HIP DLLs fail to load."""
+    if _g(exam, "os_family", default="") != "windows":
+        return Diagnosis(id="fix-15-msvc-redist", title="MSVC runtime missing", score=0)
+
+    score = 0
+    evidence: list[str] = []
+    redist = _g(exam, "msvc_redist_present", default=None)
+    if redist is False:
+        score += 45
+        evidence.append("vcruntime140.dll / vcruntime140_1.dll not resolvable on PATH")
+
+    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_MSVC_REDIST)
+    score += kw_score
+    evidence += kw_ev
+
+    if score <= 0:
+        return Diagnosis(id="fix-15-msvc-redist", title="MSVC runtime missing", score=0)
+
+    fix = Fix(
+        summary=(
+            "Install the Microsoft Visual C++ 2015-2022 redistributable so the "
+            "HIP SDK's amdhip64_*.dll can load."
+        ),
+        commands=[
+            "# Download & install (x64):",
+            "#   https://aka.ms/vs/17/release/vc_redist.x64.exe",
+            "# After the install, reopen the shell and re-run your import / hipInfo check.",
+        ],
+        fix_id="fix-15-msvc-redist",
+        auto_applicable=False,
+        verify=(
+            "where vcruntime140.dll && where vcruntime140_1.dll"
+        ),
+        notes=[
+            "If installing the redistributable still leaves a missing-DLL error, "
+            "the failing DLL is probably amdhip64_X.dll itself; that points at "
+            "fix-13-hip-sdk-missing (the HIP SDK install) rather than this fix.",
+        ],
+    )
+    return Diagnosis(
+        id="fix-15-msvc-redist",
+        title="MSVC runtime missing (HIP DLLs cannot load)",
+        score=min(score, 100),
+        evidence=evidence,
+        fix=fix,
+    )
+
+
+# Registry of every diagnosis checker, listed in check-number order.
+# Each entry: (checker, frozenset of os_family values it applies to).
+# `run_all_checks` reads the running OS from the exam JSON and skips
+# checkers whose OS set (unpacked there as `applicable_on`) doesn't
+# include it. This keeps the OS branching in one place rather than
+# scattered through the checkers. Checkers listed for both families
+# branch on `os_family` internally to pick the OS-specific fix.
+CHECKERS: list[tuple[Callable[[dict, str], Diagnosis], frozenset[str]]] = [
+    (check_1_arch_not_in_wheel,       frozenset({"linux", "windows"})),
+    (check_2_hsa_override_unneeded,   frozenset({"linux", "windows"})),
+    (check_3_rocm_kernel_unsupported, frozenset({"linux"})),
+    (check_4_render_group,            frozenset({"linux"})),
+    (check_5_amdgpu_blacklisted,      frozenset({"linux"})),
+    (check_6_path_missing,            frozenset({"linux", "windows"})),
+    (check_7_stale_repos,             frozenset({"linux"})),
+    (check_8_wheel_rocm_mismatch,     frozenset({"linux", "windows"})),
+    (check_9_igpu_dgpu_collision,     frozenset({"linux", "windows"})),
+    (check_10_container_devices,      frozenset({"linux"})),
+    (check_11_iommu_hang,             frozenset({"linux"})),
+    (check_12_amdgpu_install_broken,  frozenset({"linux"})),
+    (check_13_hip_sdk_missing,        frozenset({"windows"})),
+    (check_14_adrenalin_too_old,      frozenset({"windows"})),
+    (check_15_msvc_redist,            frozenset({"windows"})),
+]
+
+
+def run_all_checks(exam: dict, symptom: str) -> list[Diagnosis]:
+    """Run every applicable checker, drop zero-score results, sort by score desc.
+
+    Checkers whose `applicable_on` set doesn't include the running OS are
+    skipped silently (they were never going to score against this exam).
+    """
+    os_family = _g(exam, "os_family", default="linux")
+    results: list[Diagnosis] = []
+    for fn, applicable_on in CHECKERS:
+        if os_family not in applicable_on:
+            continue
+        try:
+            d = fn(exam, symptom or "")
+        except Exception as exc:  # checker bug should not kill diagnose
+            results.append(Diagnosis(
+                id=f"checker-error-{fn.__name__}",
+                title=f"Internal checker error in {fn.__name__}",
+                score=0,
+                evidence=[f"{type(exc).__name__}: {exc}"],
+            ))
+            continue
+        if d.score > 0:
+            results.append(d)
+    results.sort(key=lambda d: d.score, reverse=True)
+    return results
+
+
+# ---------------------------------------------------------------------------
+# Output
+# ---------------------------------------------------------------------------
+
+def _route_when_no_match(exam: dict) -> dict:
+    """Choose the upstream tracker to point the user at when nothing matched.
+
+    A framework that has its own tracker key maps to itself; any other
+    value (the lookup defaults to "unknown") falls back to "rocm-core".
+    Returns `{"target": <key>, "url": UPSTREAM_TRACKERS[<key>]}`.
+    """
+    with_own_tracker = frozenset(
+        {"pytorch", "llama-cpp", "lemonade", "ollama", "lm-studio"}
+    )
+    framework = _g(exam, "framework", default="unknown")
+    target = framework if framework in with_own_tracker else "rocm-core"
+    return {"target": target, "url": UPSTREAM_TRACKERS[target]}
+
+
+def _print_human(diagnoses: list[Diagnosis], exam: dict, top: int) -> None:
+    """Print the human-readable report for `diagnoses` to stdout.
+
+    With no diagnoses at all, prints the "no match" message together with
+    the upstream tracker chosen by `_route_when_no_match`. Otherwise prints
+    the first `top` diagnoses (confidence tier, evidence, and the fix plan
+    when one is attached) followed by a next-step hint that depends on
+    whether any diagnosis reaches HIGH_CONFIDENCE.
+    """
+    if not diagnoses:
+        route = _route_when_no_match(exam)
+        for text in (
+            "rocm-doctor: no known misconfiguration matched.",
+            "",
+            "This is the explicit 'I don't recognise this failure mode' case. "
+            "Do not speculate; file the symptom + this examination output upstream:",
+            f"  {route['target']:>12s}: {route['url']}",
+            "",
+            "Include the JSON from `python scripts/examine.py --json` in your report.",
+        ):
+            print(text)
+        return
+
+    for rank, diag in enumerate(diagnoses[:top], 1):
+        if diag.score >= HIGH_CONFIDENCE:
+            tier = "HIGH"
+        elif diag.score >= MIN_SCORE_FOR_MATCH:
+            tier = "LIKELY"
+        else:
+            tier = "WEAK"
+        print(f"#{rank} [{tier} score={diag.score}/100] {diag.title}")
+        print(f"   id: {diag.id}")
+        for item in diag.evidence:
+            print(f"   - {item}")
+        fix = diag.fix
+        if fix:
+            print(f"   plan: {fix.summary}")
+            for command in fix.commands:
+                print(f"     $ {command}")
+            # (condition, label) pairs, in the order the labels are shown.
+            labelled = (
+                (fix.needs_sudo, "sudo"),
+                (fix.needs_reboot, "reboot required"),
+                (fix.needs_relogin, "re-login required"),
+                (fix.auto_applicable, "apply_fix.py can run it"),
+            )
+            flags = [label for wanted, label in labelled if wanted]
+            if flags:
+                print(f"   flags: {', '.join(flags)}")
+            for note in fix.notes:
+                print(f"   note: {note}")
+            if fix.verify:
+                print(f"   verify after fix: {fix.verify}")
+        print()
+
+    # The hint looks at ALL diagnoses, not only the `top` that were printed.
+    first_high = next(
+        (diag for diag in diagnoses if diag.score >= HIGH_CONFIDENCE), None
+    )
+    if first_high is None:
+        print(
+            "Highest-scoring match is below the HIGH_CONFIDENCE threshold. "
+            "Confirm one more piece of evidence with the user before applying."
+        )
+    else:
+        print(f"Next step: propose `apply_fix.py --fix-id {first_high.id}` to the user.")
+
+
+def _to_jsonable(diagnoses: list[Diagnosis], exam: dict) -> dict:
+    """Build the `--json` payload.
+
+    "matched" holds EVERY diagnosis passed in (converted with `asdict`),
+    not only those at or above MIN_SCORE_FOR_MATCH; both thresholds are
+    included so a consumer can filter for itself.
+    """
+    payload: dict = {}
+    payload["matched"] = list(map(asdict, diagnoses))
+    payload["min_score_for_match"] = MIN_SCORE_FOR_MATCH
+    payload["high_confidence_threshold"] = HIGH_CONFIDENCE
+    payload["route_when_no_match"] = _route_when_no_match(exam)
+    return payload
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: load an exam JSON, run the checkers, print the result.
+
+    Args:
+        argv: Argument list to parse; None lets argparse read sys.argv.
+
+    Returns:
+        0 when at least one diagnosis scores >= MIN_SCORE_FOR_MATCH,
+        1 when none does, and 2 when the exam file is missing, cannot be
+        read, is not valid JSON, or is not a JSON object.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--exam", type=Path, required=True,
+        help="Path to the JSON produced by `examine.py --json`.",
+    )
+    parser.add_argument(
+        "--symptom", default="",
+        help="Raw error text from the user; symptom-keyword scoring uses it.",
+    )
+    parser.add_argument(
+        "--top", type=int, default=5,
+        help="Show at most this many matching diagnoses (default 5).",
+    )
+    parser.add_argument("--json", action="store_true",
+                        help="Emit machine-readable JSON instead of the human view.")
+    args = parser.parse_args(argv)
+
+    # exists() (not is_file()) so that `--exam /dev/stdin` keeps working.
+    if not args.exam.exists():
+        print(f"exam file not found: {args.exam}", file=sys.stderr)
+        return 2
+    try:
+        exam = json.loads(args.exam.read_text(encoding="utf-8"))
+    except OSError as exc:
+        # A directory or an unreadable file passes exists() but fails here.
+        print(f"exam file could not be read: {exc}", file=sys.stderr)
+        return 2
+    except ValueError as exc:
+        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
+        print(f"exam file is not valid JSON: {exc}", file=sys.stderr)
+        return 2
+    if not isinstance(exam, dict):
+        # The checkers look fields up by key; a list/number/string would
+        # only produce confusing errors further down.
+        print(
+            f"exam file must contain a JSON object, got {type(exam).__name__}",
+            file=sys.stderr,
+        )
+        return 2
+
+    diagnoses = run_all_checks(exam, args.symptom)
+    matched = [d for d in diagnoses if d.score >= MIN_SCORE_FOR_MATCH]
+    if args.json:
+        print(json.dumps(_to_jsonable(diagnoses, exam), indent=2))
+    else:
+        _print_human(diagnoses, exam, args.top)
+    return 0 if matched else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/rocm-doctor/scripts/examine.py b/skills/rocm-doctor/scripts/examine.py
new file mode 100644
index 0000000..844dc98
--- /dev/null
+++ b/skills/rocm-doctor/scripts/examine.py
@@ -0,0 +1,1268 @@
+#!/usr/bin/env -S uv run --quiet
+# /// script
+# requires-python = ">=3.10"
+# dependencies = []
+# ///
+"""Read-only system examination for the `rocm-doctor` skill.
+
+This is the first script the skill runs once it has decided the user's
+framework actually touches the system ROCm install (so: PyTorch, llama.cpp,
+and anything built against `/opt/rocm` on Linux or the HIP SDK on Windows,
+but NOT Lemonade / LM Studio / Ollama, which ship their own runtime).
+
+The script collects the minimum set of facts needed to disambiguate
+every known misconfiguration in `reference.md`. It never installs or
+removes packages, never changes group membership, and never edits files.
+
+Supported platforms:
+  - Linux (native): full Linux probe set.
+  - Windows: HIP SDK + Adrenalin probes (no /sys, no rocminfo; uses
+    Win32_VideoController and hipInfo.exe instead).
+  - WSL2: detected and refused with a route-out message. The ROCm-on-WSL
+    flow needs Adrenalin Pro + the WSL kernel update on the Windows host
+    and is not in this catalog.
+
+Exit codes:
+  0 = examination ran; results emitted. The agent should pass the JSON to
+      `diagnose.py` next.
+  2 = wrong platform (WSL, neither Linux nor Windows, or no AMD GPU). The
+      agent should stop and route the user instead of running diagnose.
+  3 = examination ran but something prevented a key probe from completing
+      and the agent should warn the user before continuing.
+
+Usage:
+    python scripts/examine.py
+    python scripts/examine.py --json
+    python scripts/examine.py --framework pytorch
+    python scripts/examine.py --framework llama-cpp --json
+
+The optional `--framework` flag scopes the framework-specific probes
+(e.g. running PyTorch's `torch.version.hip`). When omitted the script
+probes everything it can detect without launching a Python interpreter
+for a framework that may not be installed.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import platform
+import re
+import shutil
+import stat
+import subprocess
+import sys
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+
+# Environment variables that silently change ROCm/HIP behaviour. We record
+# every one of these, even when empty, so `diagnose.py` can see both the
+# value and the fact that it is unset (which is itself a signal for some
+# misconfigurations -- e.g. ROCM_PATH being unset is fine, but the user
+# having set HSA_OVERRIDE_GFX_VERSION on a supported GPU is suspicious).
+TRACKED_ENV_VARS = (
+    "HSA_OVERRIDE_GFX_VERSION",
+    "HIP_VISIBLE_DEVICES",
+    "ROCR_VISIBLE_DEVICES",
+    "CUDA_VISIBLE_DEVICES",          # PyTorch HIP also honours this name.
+    "GPU_DEVICE_ORDINAL",
+    "ROCM_PATH",
+    "ROCM_HOME",
+    "HIP_PATH",                      # Windows: HIP SDK install root (e.g. C:\Program Files\AMD\ROCm\6.4\).
+    "HIP_PLATFORM",                  # Windows: usually "amd"; "nvidia" means user is on the wrong toolchain.
+    "PYTORCH_ROCM_ARCH",
+    "HCC_AMDGPU_TARGET",
+    "AMDGPU_TARGETS",
+    "LD_LIBRARY_PATH",
+    "PATH",
+)
+
+# Files the amdgpu-install pipeline drops on APT-based systems. Presence
+# of these tells us "installed via amdgpu-install", absence + apt-installed
+# ROCm packages tells us "installed via plain apt", and absence of both
+# with a populated /opt/rocm typically means "tarball or pip wheel".
+AMDGPU_INSTALL_MARKERS = (
+    "/etc/apt/sources.list.d/amdgpu.list",
+    "/etc/apt/sources.list.d/rocm.list",
+    "/etc/apt/sources.list.d/radeon.list",
+    "/etc/yum.repos.d/amdgpu.repo",
+    "/etc/yum.repos.d/rocm.repo",
+)
+
+# Containers we can detect cheaply from /proc/1/cgroup or marker files.
+CONTAINER_MARKERS = {
+    "/.dockerenv": "docker",
+    "/run/.containerenv": "podman",
+}
+
+
+@dataclass
+class GPU:
+    """One display/3D controller recorded by the GPU probes (AMD or NVIDIA)."""
+    name: str = ""
+    gfx_target: str = ""        # e.g. gfx1151; empty when no target could be determined
+    pci_id: str = ""            # on Linux, the first field of the lspci line; empty for rocminfo-only entries
+    is_apu: bool | None = None  # True = APU, False = not an APU, None = not classified
+    is_amd: bool = False
+
+
+@dataclass
+class Device:
+    """Snapshot of one device node, as filled in by `_stat_device`."""
+    path: str
+    exists: bool
+    mode: str = ""              # e.g. "crw-rw----"; "stat failed: ..." when os.stat() raised
+    owner_user: str = ""        # user name, or the numeric uid as text when it can't be resolved
+    owner_group: str = ""       # group name, or the numeric gid as text when it can't be resolved
+    user_can_read: bool | None = None   # os.access(R_OK); None when the node was not stat'ed
+    user_can_write: bool | None = None  # os.access(W_OK); None when the node was not stat'ed
+
+
+@dataclass
+class Examination:
+    """Everything the probes learn about the host; serialised for diagnose.py.
+
+    Fields typed `... | None` use None for "not probed / could not tell".
+    """
+    # --- platform ---
+    os_family: str = "unknown"          # linux | windows | other ("unknown" until _probe_os runs)
+    os_version: str = ""
+    distro_id: str = ""                 # ubuntu, debian, rhel, fedora, ...
+    distro_version: str = ""
+    kernel_release: str = ""
+    kernel_cmdline: str = ""
+    is_wsl: bool = False                # True iff running inside WSL2 (out of scope; see notes).
+
+    # --- hardware ---
+    cpu_vendor: str = "unknown"
+    cpu_model: str = ""
+    gpus: list[GPU] = field(default_factory=list)
+    has_amd_gpu: bool = False
+    has_nvidia_gpu: bool = False
+    has_apu: bool = False
+    has_discrete_amd: bool = False
+
+    # --- driver / runtime (Linux) ---
+    amdgpu_loaded: bool | None = None
+    amdgpu_blacklisted_in: list[str] = field(default_factory=list)
+    amdkfd_loaded: bool | None = None
+    secure_boot: str = "unknown"        # enabled | disabled | unknown
+    iommu_kernel_param: str = ""        # value of iommu=, empty if unset
+    kfd: Device | None = None
+    render_devices: list[Device] = field(default_factory=list)
+
+    # --- user / groups (Linux) ---
+    user_name: str = ""
+    user_groups: list[str] = field(default_factory=list)
+    in_render_group: bool | None = None
+    in_video_group: bool | None = None
+
+    # --- ROCm install (Linux) ---
+    rocm_version: str = ""              # e.g. 6.4.1
+    rocm_install_method: str = ""       # _probe_rocm_install sets: amdgpu-install | apt | dnf | tarball-or-other | none
+    rocm_path: str = ""                 # /opt/rocm typically
+    rocminfo_present: bool = False
+    rocminfo_status: str = ""           # ok | not-loaded | permission-denied | error rc=N | missing
+    hip_libs_on_ld_path: bool | None = None
+    rocm_repos_seen: list[str] = field(default_factory=list)
+
+    # --- HIP SDK install (Windows) ---
+    hip_sdk_path: str = ""              # e.g. C:\Program Files\AMD\ROCm\6.4\
+    hip_sdk_version: str = ""           # e.g. 6.4 (parsed from the install dir)
+    hipinfo_present: bool = False
+    hipinfo_status: str = ""            # ok | error rc=N | missing
+    adrenalin_version: str = ""         # Win32_VideoController.DriverVersion (e.g. 32.0.11020.5)
+    msvc_redist_present: bool | None = None  # vcruntime140 / vcruntime140_1 resolvable
+
+    # --- framework ---
+    framework: str = "unknown"          # pytorch | llama-cpp | unknown | skipped
+    framework_version: str = ""
+    framework_rocm_version: str = ""    # "hip=<torch.version.hip>", "cuda=<torch.version.cuda>", or "GGML_HIP=ON"
+    framework_arch_list: list[str] = field(default_factory=list)
+    framework_notes: list[str] = field(default_factory=list)
+
+    # --- environment ---
+    env: dict[str, str] = field(default_factory=dict)
+
+    # --- container ---
+    in_container: bool = False
+    container_kind: str = ""
+
+    # --- evidence captured for diagnose.py ---
+    dmesg_amdgpu_tail: list[str] = field(default_factory=list)
+    notes: list[str] = field(default_factory=list)
+    probe_failures: list[str] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Shell helpers (never raise)
+# ---------------------------------------------------------------------------
+
+def _run(cmd: list[str], timeout: float = 5.0) -> tuple[int, str, str]:
+    """Run `cmd`; return (rc, stdout, stderr). Never raises.
+
+    Any failure to launch or finish the command (missing binary, timeout,
+    OS error, invalid argument such as an embedded NUL) is reported as
+    rc 127 with empty output. Bytes that cannot be decoded are replaced
+    with U+FFFD rather than raising, so one stray byte in a tool's output
+    cannot abort the whole examination.
+    """
+    try:
+        r = subprocess.run(
+            cmd, capture_output=True, text=True, errors="replace",
+            timeout=timeout, check=False,
+        )
+    except (subprocess.SubprocessError, OSError, ValueError):
+        # FileNotFoundError is an OSError; TimeoutExpired is a SubprocessError.
+        return 127, "", ""
+    return r.returncode, r.stdout or "", r.stderr or ""
+
+
+def _read_text(path: str) -> str:
+    """Return the contents of `path` decoded as UTF-8, or "" on any OSError.
+
+    Undecodable bytes are replaced instead of raising.
+    """
+    target = Path(path)
+    try:
+        with target.open(encoding="utf-8", errors="replace") as handle:
+            return handle.read()
+    except OSError:
+        return ""
+
+
+def _have(cmd: str) -> bool:
+    """Return True when `shutil.which` can resolve `cmd`."""
+    located = shutil.which(cmd)
+    return located is not None
+
+
+# ---------------------------------------------------------------------------
+# Platform probes
+# ---------------------------------------------------------------------------
+
+def _probe_os(e: Examination) -> None:
+    """Fill the platform fields of `e` (OS family, distro, kernel, WSL flag).
+
+    `os_family` becomes "linux", "windows" or "other"; the distro, kernel,
+    iommu= and WSL fields are only populated on Linux.
+    """
+    system = platform.system().lower()
+    e.os_version = platform.platform()
+    if system != "linux":
+        e.os_family = "windows" if system == "windows" else "other"
+        return
+
+    e.os_family = "linux"
+    e.kernel_release = platform.release()
+    e.kernel_cmdline = _read_text("/proc/cmdline").strip()
+
+    # /etc/os-release is the standard for distro identity since 2012.
+    for raw in _read_text("/etc/os-release").splitlines():
+        key, sep, value = raw.partition("=")
+        if not sep:
+            continue
+        value = value.strip().strip('"')
+        if key == "ID":
+            e.distro_id = value
+        elif key == "VERSION_ID":
+            e.distro_version = value
+
+    iommu = re.search(r"\biommu=(\w+)", e.kernel_cmdline)
+    if iommu:
+        e.iommu_kernel_param = iommu.group(1)
+
+    # WSL shows up in /proc/version and via the WSL_DISTRO_NAME env var.
+    # It is treated as out of scope: the ROCm-on-WSL flow needs Adrenalin
+    # Pro + the WSL kernel update on the Windows host, not the native-Linux
+    # fixes in this catalog.
+    banner = _read_text("/proc/version").lower()
+    banner_says_wsl = "microsoft" in banner or "wsl" in banner
+    if banner_says_wsl or os.environ.get("WSL_DISTRO_NAME"):
+        e.is_wsl = True
+
+
+def _probe_cpu(e: Examination) -> None:
+    """Fill `e.cpu_vendor` and `e.cpu_model`.
+
+    Linux: taken from the first `vendor_id` and `model name` entries of
+    /proc/cpuinfo. Windows: taken from Win32_Processor via PowerShell; a
+    failure there is recorded in `e.probe_failures`. Other OS families are
+    left untouched.
+    """
+    if e.os_family == "linux":
+        for line in _read_text("/proc/cpuinfo").splitlines():
+            key, sep, val = line.partition(":")
+            if not sep:
+                # Blank separator lines between CPUs carry no "key: value".
+                continue
+            key, val = key.strip(), val.strip()
+            # Match on the key itself: the old combined and/or condition was
+            # true for the very first line ("processor : 0") because the
+            # vendor still held its "unknown" default.
+            if key == "vendor_id" and e.cpu_vendor == "unknown":
+                if "AMD" in val:
+                    e.cpu_vendor = "amd"
+                elif "Intel" in val:
+                    e.cpu_vendor = "intel"
+                else:
+                    e.cpu_vendor = val.lower() or "unknown"
+            elif key == "model name" and not e.cpu_model:
+                e.cpu_model = val
+            if e.cpu_vendor != "unknown" and e.cpu_model:
+                break
+    elif e.os_family == "windows":
+        rc, out, _ = _run([
+            "powershell", "-NoProfile", "-Command",
+            "(Get-CimInstance Win32_Processor | Select-Object -First 1).Name",
+        ], timeout=8)
+        if rc == 0 and out.strip():
+            e.cpu_model = out.strip().splitlines()[0].strip()
+            lname = e.cpu_model.lower()
+            e.cpu_vendor = "amd" if "amd" in lname else ("intel" if "intel" in lname else "unknown")
+        else:
+            e.probe_failures.append("Get-CimInstance Win32_Processor failed; cannot identify CPU.")
+
+
+# ---------------------------------------------------------------------------
+# GPU probes
+# ---------------------------------------------------------------------------
+
+# Strix Halo / Phoenix / Hawk Point / Strix Point marketing names commonly
+# seen in `lspci`. Used to flag the GPU as an APU when rocminfo isn't
+# available.
+_APU_KEYWORDS = (
+    "strix halo", "ryzen ai max", "phoenix", "hawk point", "strix point",
+    "krackan", "rembrandt", "raphael", "barcelo", "lucienne", "renoir",
+    "cezanne",
+)
+
+
+def _classify_amd_marketing_name(name: str) -> tuple[str, bool]:
+    """Guess (gfx_target, is_apu) from an AMD GPU marketing name.
+
+    The match is a case-insensitive substring test after "(R)", "(TM)" and
+    "(C)" decorations are removed and whitespace is collapsed, so Windows
+    adapter names such as "AMD Radeon(TM) 8060S Graphics" match
+    "radeon 8060s". When no rule matches, the gfx target is "" and is_apu
+    says whether any `_APU_KEYWORDS` entry occurs in the name; `rocminfo`
+    (Linux) or `hipInfo.exe` (Windows) is then the source of truth.
+    """
+    n = re.sub(r"\(\s*(?:tm|r|c)\s*\)", " ", name.lower())
+    n = re.sub(r"\s+", " ", n).strip()
+    # Rules are tried top to bottom; the first one with a hit wins. Strix
+    # Halo appears as the CPU package name on Linux, the iGPU adapter name
+    # on Windows, or the codename in docs; all of them map to gfx1151.
+    rules = (
+        ("gfx1151", ("ryzen ai max", "strix halo",
+                     "radeon 8050s", "radeon 8060s", "radeon 8045s")),
+        ("gfx1150", ("radeon 880m", "radeon 890m", "strix point", "krackan")),
+        ("gfx1103", ("radeon 780m", "radeon 760m", "radeon 740m",
+                     "phoenix", "hawk point")),
+    )
+    for gfx, needles in rules:
+        if any(needle in n for needle in needles):
+            return gfx, True
+    return "", any(kw in n for kw in _APU_KEYWORDS)
+
+
+def _probe_gpus_lspci(e: Examination) -> None:
+    """Append one `GPU` per AMD/NVIDIA VGA, 3D or Display controller in lspci.
+
+    A missing or failing `lspci` is recorded in `e.probe_failures`. NVIDIA
+    devices also set `e.has_nvidia_gpu`; controllers from other vendors are
+    ignored.
+    """
+    if not _have("lspci"):
+        e.probe_failures.append("lspci not found; cannot enumerate PCI GPUs")
+        return
+    rc, out, _ = _run(["lspci", "-nn", "-D"], timeout=8)
+    if rc != 0:
+        e.probe_failures.append("lspci returned non-zero; PCI enumeration incomplete")
+        return
+
+    controller_re = re.compile(
+        r"(VGA compatible controller|3D controller|Display controller)"
+    )
+    # The marketing name sits between the controller-kind colon and the
+    # `[vendor:device]` tail.
+    name_re = re.compile(
+        r"\S+\s+(?:VGA compatible controller|3D controller|Display controller)"
+        r"\s*\[\w+\]:\s*(.+?)\s*\[[\da-f]{4}:[\da-f]{4}\]",
+        re.IGNORECASE,
+    )
+    for line in out.splitlines():
+        if not controller_re.search(line):
+            continue
+        fields = line.split()
+        pci_id = fields[0] if fields else ""
+        found = name_re.match(line)
+        # Fall back to the whole line when the name can't be isolated.
+        name = found.group(1).strip() if found else line
+        if "[10de" in line or "NVIDIA" in line:
+            e.has_nvidia_gpu = True
+            e.gpus.append(GPU(name=name, pci_id=pci_id, is_amd=False, is_apu=False))
+        elif "[1002" in line or "Advanced Micro Devices" in line or "AMD" in line:
+            gfx_guess, is_apu_guess = _classify_amd_marketing_name(name)
+            e.gpus.append(GPU(
+                name=name, gfx_target=gfx_guess, pci_id=pci_id,
+                is_apu=is_apu_guess, is_amd=True,
+            ))
+
+
+def _gfx_target_is_apu(gfx: str) -> bool:
+    """Return True when `gfx` names an integrated (APU) LLVM gfx target.
+
+    Covers gfx902/gfx909/gfx90c, gfx1033/gfx1035/gfx1036, gfx1103 and the
+    gfx115x family (per the LLVM AMDGPU processor table). gfx1100-gfx1102
+    are deliberately NOT matched: they are discrete RDNA3 cards, which a
+    "gfx110x" prefix match would wrongly flag as APUs.
+    """
+    return re.fullmatch(r"gfx(?:90[29c]|103[356]|1103|115\d)", gfx) is not None
+
+
+def _rocminfo_gpu_agents(out: str) -> list[tuple[str, str]]:
+    """Parse rocminfo output into (gfx_target, marketing_name) per GPU agent.
+
+    rocminfo blocks look like:
+      Agent 2
+        Name:            gfx1151
+        Marketing Name:  AMD Radeon Graphics
+        Device Type:     GPU
+        ...
+        ISA Info:
+          ISA 1
+            Name:        amdgcn-amd-amdhsa--gfx1151
+    """
+    agents: list[dict[str, str]] = []
+    cur: dict[str, str] = {}  # lines before the first "Agent N" land here and are dropped
+    for line in out.splitlines():
+        s = line.strip()
+        if s.startswith("Agent "):
+            cur = {}
+            agents.append(cur)
+            continue
+        key, sep, val = s.partition(":")
+        if sep and key in ("Name", "Marketing Name", "Device Type"):
+            # setdefault keeps the FIRST value per agent. The nested
+            # "ISA Info" section repeats "Name:" with the full ISA triple;
+            # letting it overwrite the agent name would make every GPU
+            # fail the startswith("gfx") test below.
+            cur.setdefault(key, val.strip())
+    return [
+        (a["Name"], a.get("Marketing Name", ""))
+        for a in agents
+        if "GPU" in a.get("Device Type", "") and a.get("Name", "").startswith("gfx")
+    ]
+
+
+def _probe_gpus_rocminfo(e: Examination) -> None:
+    """Refine the AMD GPU list with rocminfo's authoritative gfx targets.
+
+    rocminfo's output is the ground truth for the LLVM gfx target the
+    runtime will load kernels for. Sets `rocminfo_present` and
+    `rocminfo_status` ("missing", "ok", "not-loaded", "permission-denied"
+    or "error rc=N"); a non-zero exit is captured because it is a signal in
+    its own right (e.g. "ROCk module is NOT loaded").
+    """
+    if not _have("rocminfo"):
+        e.rocminfo_present = False
+        e.rocminfo_status = "missing"
+        return
+    e.rocminfo_present = True
+    rc, out, err = _run(["rocminfo"], timeout=15)
+    if rc != 0:
+        merged = (out + "\n" + err).lower()
+        if "rock module is not loaded" in merged:
+            e.rocminfo_status = "not-loaded"
+        elif "permission denied" in merged or "operation not permitted" in merged:
+            e.rocminfo_status = "permission-denied"
+        else:
+            e.rocminfo_status = f"error rc={rc}"
+        return
+    e.rocminfo_status = "ok"
+
+    gfx_targets = _rocminfo_gpu_agents(out)
+    if not gfx_targets:
+        return
+
+    # Reconcile with the list already in e.gpus: the N-th rocminfo GPU agent
+    # overwrites the gfx target and APU flag of the N-th AMD entry; surplus
+    # agents become new entries.
+    # NOTE(review): pairing by position assumes both tools list the GPUs in
+    # the same order -- confirm on a multi-GPU host.
+    amd_entries = [g for g in e.gpus if g.is_amd]
+    for idx, (gfx, marketing) in enumerate(gfx_targets):
+        is_apu = _gfx_target_is_apu(gfx)
+        if idx < len(amd_entries):
+            entry = amd_entries[idx]
+            entry.gfx_target = gfx
+            if marketing and not entry.name:
+                entry.name = marketing
+            entry.is_apu = is_apu
+        else:
+            e.gpus.append(GPU(
+                name=marketing or "AMD GPU", gfx_target=gfx,
+                is_amd=True, is_apu=is_apu,
+            ))
+
+
+def _summarise_gpu_categories(e: Examination) -> None:
+    """Derive the has_amd_gpu / has_apu / has_discrete_amd flags from `e.gpus`.
+
+    An AMD GPU whose `is_apu` is None (unclassified) counts as neither an
+    APU nor a discrete card.
+    """
+    amd = [gpu for gpu in e.gpus if gpu.is_amd]
+    e.has_amd_gpu = bool(amd)
+    e.has_apu = any(gpu.is_apu for gpu in amd)
+    e.has_discrete_amd = any(gpu.is_apu is False for gpu in amd)
+
+
+# ---------------------------------------------------------------------------
+# Kernel module / device probes
+# ---------------------------------------------------------------------------
+
+def _probe_modules(e: Examination) -> None:
+    """Record whether amdgpu/amdkfd are loaded and which files blacklist amdgpu.
+
+    Linux only. The loaded flags stay None when neither `lsmod` nor
+    /proc/modules yields a module list.
+    NOTE(review): kernels that build KFD into amdgpu may never list a
+    separate "amdkfd" module -- confirm how diagnose.py reads amdkfd_loaded.
+    """
+    if e.os_family != "linux":
+        return
+
+    def first_fields(lines: list[str]) -> set[str]:
+        return {parts[0] for parts in (ln.split() for ln in lines) if parts}
+
+    loaded = None
+    rc, out, _ = _run(["lsmod"], timeout=5)
+    if rc == 0:
+        loaded = first_fields(out.splitlines()[1:])  # [1:] skips the header row
+    else:
+        # /proc/modules is always readable and is the source of truth for lsmod.
+        raw = _read_text("/proc/modules")
+        if raw:
+            loaded = first_fields(raw.splitlines())
+    if loaded is not None:
+        e.amdgpu_loaded = "amdgpu" in loaded
+        e.amdkfd_loaded = "amdkfd" in loaded
+
+    # Not a full modprobe.d parser: any *.conf with a literal
+    # "blacklist amdgpu" line is flagged so the agent can ask the user to
+    # inspect it.
+    blacklist_re = re.compile(r"^\s*blacklist\s+amdgpu\b", re.MULTILINE)
+    for directory in ("/etc/modprobe.d", "/usr/lib/modprobe.d", "/run/modprobe.d"):
+        try:
+            for conf in Path(directory).glob("*.conf"):
+                if blacklist_re.search(_read_text(str(conf))):
+                    e.amdgpu_blacklisted_in.append(str(conf))
+        except OSError:
+            continue
+
+
+def _probe_devices(e: Examination) -> None:
+    """Snapshot /dev/kfd and every /dev/dri/renderD* node (Linux only)."""
+    if e.os_family != "linux":
+        return
+    e.kfd = _stat_device("/dev/kfd")
+    try:
+        e.render_devices.extend(
+            _stat_device(str(node))
+            for node in sorted(Path("/dev/dri").glob("renderD*"))
+        )
+    except OSError:
+        # Best effort: an unreadable /dev/dri just leaves the list as it is.
+        pass
+
+
+def _stat_device(path: str) -> Device:
+    """Describe the device node at `path` without opening it.
+
+    Returns a `Device` with `exists=False` and nothing else set when the
+    path is absent. When `os.stat` fails, `mode` carries the
+    "stat failed: ..." text and the remaining fields keep their defaults.
+    """
+    if not os.path.exists(path):
+        return Device(path=path, exists=False)
+    dev = Device(path=path, exists=True)
+    try:
+        info = os.stat(path)
+    except OSError as exc:
+        dev.mode = f"stat failed: {exc}"
+        return dev
+    dev.mode = stat.filemode(info.st_mode)
+    # The helpers resolve ids through pwd/grp and fall back to the numeric
+    # id as text when those modules are unavailable or the id is unknown.
+    dev.owner_user = _uid_to_name(info.st_uid)
+    dev.owner_group = _gid_to_name(info.st_gid)
+    dev.user_can_read = os.access(path, os.R_OK)
+    dev.user_can_write = os.access(path, os.W_OK)
+    return dev
+
+
+def _uid_to_name(uid: int) -> str:
+    """Return the user name for `uid`, or `str(uid)` when it can't be resolved."""
+    try:
+        import pwd
+        entry = pwd.getpwuid(uid)
+    except (KeyError, ImportError, OSError):
+        return str(uid)
+    return entry.pw_name
+
+
+def _gid_to_name(gid: int) -> str:
+    """Return the group name for `gid`, or `str(gid)` when it can't be resolved."""
+    try:
+        import grp
+        entry = grp.getgrgid(gid)
+    except (KeyError, ImportError, OSError):
+        return str(gid)
+    return entry.gr_name
+
+
+def _probe_user(e: Examination) -> None:
+    """Record the current user's name and group memberships (Linux only).
+
+    Groups come from `id -Gn`, or from the passwd/group databases when that
+    command fails. `in_render_group` / `in_video_group` are only set when a
+    group list was actually obtained; otherwise they stay None ("unknown")
+    so a failed probe is not reported as "user is not in the group".
+    """
+    if e.os_family != "linux":
+        return
+    e.user_name = os.environ.get("USER") or os.environ.get("LOGNAME") or ""
+    rc, out, _ = _run(["id", "-Gn"], timeout=3)
+    if rc == 0:
+        e.user_groups = out.split()
+    else:
+        # Fallback: look the process uid up directly. Supplementary
+        # membership is matched on the passwd name for that uid rather than
+        # $USER, which can be unset or belong to someone else.
+        try:
+            import grp
+            import pwd
+            me = pwd.getpwuid(os.getuid())
+            e.user_groups = [
+                g.gr_name for g in grp.getgrall()
+                if me.pw_name in g.gr_mem or g.gr_gid == me.pw_gid
+            ]
+        except (ImportError, KeyError, OSError):
+            pass
+    if e.user_groups:
+        e.in_render_group = "render" in e.user_groups
+        e.in_video_group = "video" in e.user_groups
+
+
+def _probe_secure_boot(e: Examination) -> None:
+    """Set `e.secure_boot` from `mokutil --sb-state` (Linux only).
+
+    The field is left untouched when mokutil is absent, exits non-zero, or
+    prints neither "enabled" nor "disabled".
+    """
+    if e.os_family != "linux" or not _have("mokutil"):
+        return
+    rc, out, _ = _run(["mokutil", "--sb-state"], timeout=3)
+    if rc != 0:
+        return
+    report = out.lower()
+    # "enabled" is tested first, matching the precedence of the report text.
+    for state in ("enabled", "disabled"):
+        if state in report:
+            e.secure_boot = state
+            break
+
+
+# ---------------------------------------------------------------------------
+# ROCm install probes
+# ---------------------------------------------------------------------------
+
+def _probe_rocm_install(e: Examination) -> None:
+    """Locate the system ROCm install and record version, origin and repos.
+
+    Linux only. Fills `rocm_path`, `rocm_version`, `rocm_install_method`
+    ("amdgpu-install", "apt", "dnf", "tarball-or-other" or "none") and
+    `rocm_repos_seen`. Unreadable files and directories are skipped rather
+    than raised.
+    """
+    if e.os_family != "linux":
+        return
+    # 1) Install root: /opt/rocm first, then $ROCM_PATH; the first one that
+    # is a directory wins.
+    rocm_dir = ""
+    for candidate in ("/opt/rocm", os.environ.get("ROCM_PATH", "")):
+        if candidate and os.path.isdir(candidate):
+            rocm_dir = candidate
+            break
+    e.rocm_path = rocm_dir
+
+    # 2) Version. Try the .info/version* files, then fall back to the
+    # X.Y.Z in the resolved /opt/rocm-X.Y.Z path.
+    if rocm_dir:
+        for fname in ("version", "version-utils", "version-libs"):
+            # _read_text() swallows OSError, so a missing, unreadable or
+            # empty file moves on to the next candidate instead of raising
+            # or ending the search with no version.
+            text = _read_text(str(Path(rocm_dir) / ".info" / fname)).strip()
+            if text:
+                e.rocm_version = text
+                break
+        if not e.rocm_version:
+            try:
+                real = os.path.realpath(rocm_dir)
+            except OSError:
+                real = ""
+            m = re.search(r"rocm-(\d+(?:\.\d+)+)", real)
+            if m:
+                e.rocm_version = m.group(1)
+
+    # 3) Install method, in priority order: amdgpu-install repo files, then
+    # a packaged rocm-hip-runtime (dpkg, then rpm), then "something is in
+    # the install root" vs. "nothing found".
+    for marker in AMDGPU_INSTALL_MARKERS:
+        if os.path.exists(marker):
+            e.rocm_install_method = "amdgpu-install"
+            e.rocm_repos_seen.append(marker)
+
+    if not e.rocm_install_method:
+        if _have("dpkg"):
+            rc, out, _ = _run(["dpkg", "-l", "rocm-hip-runtime"], timeout=8)
+            if rc == 0 and "rocm-hip-runtime" in out:
+                e.rocm_install_method = "apt"
+        if not e.rocm_install_method and _have("rpm"):
+            rc, out, _ = _run(["rpm", "-q", "rocm-hip-runtime"], timeout=8)
+            if rc == 0 and "rocm-hip-runtime" in out:
+                e.rocm_install_method = "dnf"
+    if not e.rocm_install_method:
+        e.rocm_install_method = "tarball-or-other" if rocm_dir else "none"
+
+    # 4) Stale repo detection: collect every repo file whose name mentions
+    # rocm/amdgpu/radeon. More than one at a time is common after
+    # `amdgpu-install` reruns with different `--rocmrelease`.
+    for d in ("/etc/apt/sources.list.d", "/etc/yum.repos.d"):
+        try:
+            # sorted() makes the recorded order deterministic.
+            found = sorted(
+                str(f) for f in Path(d).glob("*")
+                if re.search(r"(rocm|amdgpu|radeon)", f.name, re.IGNORECASE)
+            )
+        except OSError:
+            # One unreadable directory must not hide the other one.
+            continue
+        for x in found:
+            if x not in e.rocm_repos_seen:
+                e.rocm_repos_seen.append(x)
+
+
+# ---------------------------------------------------------------------------
+# Framework probes
+# ---------------------------------------------------------------------------
+
+# Inline Python the PyTorch probe pipes into the user's interpreter. Kept
+# tiny so it works even on Python interpreters with broken site-packages.
+_PYTORCH_PROBE = (
+    "import json,sys\n"
+    "out={'ok':False}\n"
+    "try:\n"
+    "  import torch\n"
+    "  out['ok']=True\n"
+    "  out['version']=torch.__version__\n"
+    "  out['hip']=getattr(torch.version,'hip',None)\n"
+    "  out['cuda']=getattr(torch.version,'cuda',None)\n"
+    "  out['is_available']=bool(torch.cuda.is_available())\n"
+    "  try: out['device_count']=int(torch.cuda.device_count())\n"
+    "  except Exception: out['device_count']=0\n"
+    "  try: out['arch_list']=list(torch.cuda.get_arch_list())\n"
+    "  except Exception: out['arch_list']=[]\n"
+    "except Exception as ex:\n"
+    "  out['error']=type(ex).__name__+': '+str(ex)\n"
+    "sys.stdout.write(json.dumps(out))\n"
+)
+
+
+def _probe_pytorch(e: Examination) -> None:
+    """Introspect PyTorch through the first interpreter that can import it.
+
+    Tries, in order and without repeats, `sys.executable`, `python3` and
+    `python` from PATH, running `_PYTORCH_PROBE` in each. On success fills
+    the framework fields of `e`; otherwise appends an explanation to
+    `e.framework_notes` and leaves `e.framework` untouched.
+    """
+    candidates: list[str] = []
+    for found in (sys.executable, shutil.which("python3"), shutil.which("python")):
+        if found and found not in candidates:
+            candidates.append(found)
+    if not candidates:
+        e.framework_notes.append("No python interpreter found to probe torch.")
+        return
+
+    data = None
+    import_error = ""
+    bad_output = ""
+    last_err = ""
+    for py in candidates:
+        _rc, out, err = _run([py, "-c", _PYTORCH_PROBE], timeout=20)
+        lines = out.strip().splitlines()
+        if not lines:
+            last_err = err or last_err
+            continue
+        try:
+            # The probe writes its JSON last; anything torch printed to
+            # stdout while importing comes before it.
+            parsed = json.loads(lines[-1])
+        except json.JSONDecodeError:
+            bad_output = bad_output or out
+            continue
+        if isinstance(parsed, dict) and parsed.get("ok"):
+            data = parsed
+            break
+        # The probe catches the import error itself and still exits 0, so
+        # "ok" is False (not a non-zero rc) is what must move us on to the
+        # next interpreter -- e.g. when sys.executable is uv's own env and
+        # the user's torch lives elsewhere.
+        if isinstance(parsed, dict) and not import_error:
+            import_error = str(parsed.get("error", "unknown"))
+
+    if data is None:
+        if import_error:
+            e.framework_notes.append(f"torch import failed: {import_error}")
+        elif bad_output:
+            e.framework_notes.append(f"torch probe returned non-JSON: {bad_output[:200]}")
+        else:
+            e.framework_notes.append(
+                "Could not import torch; if PyTorch is in a venv, activate it "
+                "and re-run examine.py inside that venv."
+            )
+            err_lines = last_err.strip().splitlines()
+            if err_lines:  # whitespace-only stderr has no last line to quote
+                e.framework_notes.append(f"python stderr: {err_lines[-1][:200]}")
+        return
+
+    e.framework = "pytorch"
+    e.framework_version = data.get("version", "")
+    hip = data.get("hip")
+    cuda = data.get("cuda")
+    if hip:
+        e.framework_rocm_version = f"hip={hip}"
+    elif cuda:
+        e.framework_rocm_version = f"cuda={cuda}"
+        e.framework_notes.append(
+            "This torch wheel is a CUDA build, not a ROCm build. Reinstall "
+            "from the ROCm wheel index."
+        )
+    arch = data.get("arch_list") or []
+    e.framework_arch_list = [a for a in arch if isinstance(a, str)]
+    if data.get("is_available") is False:
+        e.framework_notes.append(
+            "torch.cuda.is_available() returned False -- runtime can't see a GPU."
+        )
+
+
+def _probe_llama_cpp(e: Examination) -> None:
+    """Best-effort probe of a llama.cpp build on PATH."""
+    binary = None
+    for name in ("llama-cli", "llama-server", "main"):
+        p = shutil.which(name)
+        if p:
+            binary = p
+            break
+    if not binary:
+        e.framework_notes.append("No llama.cpp binary (llama-cli/llama-server/main) on PATH.")
+        return
+    rc, out, err = _run([binary, "--version"], timeout=10)
+    body = out + err
+    if rc != 0 and not body:
+        e.framework_notes.append(f"{binary} --version exited rc={rc}")
+        return
+    e.framework = "llama-cpp"
+    e.framework_version = body.strip().splitlines()[0][:200] if body.strip() else "unknown"
+    # Newer builds print "ROCm" or "HIP" in --version when GGML_HIP=ON.
+    if "HIP" in body or "ROCm" in body or "hipBLAS" in body:
+        e.framework_rocm_version = "GGML_HIP=ON"
+    else:
+        e.framework_notes.append(
+            "llama.cpp binary doesn't advertise HIP/ROCm support; was it built "
+            "with `cmake -DGGML_HIP=ON -DAMDGPU_TARGETS=<gfx>`?"
+        )
+
+
+def _probe_framework(e: Examination, requested: str | None) -> None:
+    if requested == "skip":
+        e.framework = "skipped"
+        return
+    if requested == "pytorch":
+        _probe_pytorch(e)
+        return
+    if requested == "llama-cpp":
+        _probe_llama_cpp(e)
+        return
+    # Auto-detect: prefer PyTorch (the common case for the doctor), then
+    # llama.cpp. We don't probe both to keep the script fast and to avoid
+    # spawning a python interpreter when the user clearly meant llama.cpp.
+    py = sys.executable or shutil.which("python") or shutil.which("python3")
+    if py:
+        _probe_pytorch(e)
+        if e.framework == "pytorch":
+            return
+    _probe_llama_cpp(e)
+
+
+# ---------------------------------------------------------------------------
+# Misc probes
+# ---------------------------------------------------------------------------
+
+def _probe_env(e: Examination) -> None:
+    for k in TRACKED_ENV_VARS:
+        v = os.environ.get(k)
+        if v is None:
+            continue
+        # Truncate enormous PATHs so JSON output stays human-scale.
+        if k in ("PATH", "LD_LIBRARY_PATH") and len(v) > 4000:
+            v = v[:4000] + "...[truncated]"
+        e.env[k] = v
+    # Quick check: does any path in LD_LIBRARY_PATH carry a libamdhip64?
+    ld = os.environ.get("LD_LIBRARY_PATH", "")
+    hits = []
+    for d in ld.split(os.pathsep):
+        if not d:
+            continue
+        try:
+            for hit in Path(d).glob("libamdhip64*"):
+                hits.append(str(hit))
+                break
+        except OSError:
+            continue
+    if hits:
+        e.hip_libs_on_ld_path = True
+        e.notes.append(f"libamdhip64 visible via LD_LIBRARY_PATH: {hits[0]}")
+    else:
+        e.hip_libs_on_ld_path = False if ld else None
+
+
+def _probe_container(e: Examination) -> None:
+    for marker, kind in CONTAINER_MARKERS.items():
+        if os.path.exists(marker):
+            e.in_container = True
+            e.container_kind = kind
+            return
+    cg = _read_text("/proc/1/cgroup")
+    if cg and any(x in cg for x in ("docker", "containerd", "lxc", "kubepods", "podman")):
+        e.in_container = True
+        e.container_kind = e.container_kind or "container"
+
+
+def _probe_dmesg_amdgpu(e: Examination) -> None:
+    if e.os_family != "linux":
+        return
+    # We try journalctl first because it works for unprivileged users when
+    # the systemd journal is world-readable; dmesg is usually root-only on
+    # modern kernels (`kernel.dmesg_restrict=1`).
+    text = ""
+    rc, out, _ = _run(["journalctl", "-k", "--no-pager", "-n", "400"], timeout=8)
+    if rc == 0 and out:
+        text = out
+    else:
+        rc, out, _ = _run(["dmesg"], timeout=5)
+        if rc == 0:
+            text = out
+    if not text:
+        return
+    # Keep at most ~15 amdgpu/amdkfd lines as evidence so the JSON stays
+    # small. We prioritise lines containing well-known failure substrings.
+    interesting = (
+        "page fault", "RAS Controller", "vm_fault", "amdgpu_device_init",
+        "OUT_OF_REGISTERS", "ring", "GPU reset", "PSP", "HW_FAULT",
+    )
+    hits: list[str] = []
+    for line in text.splitlines():
+        if "amdgpu" not in line and "amdkfd" not in line:
+            continue
+        if any(s.lower() in line.lower() for s in interesting):
+            hits.append(line.strip()[:300])
+    e.dmesg_amdgpu_tail = hits[-15:]
+
+
+# ---------------------------------------------------------------------------
+# Windows-specific probes
+#
+# Windows has no equivalent of /sys, /proc, lsmod, lspci, or rocminfo. Almost
+# everything we need is reachable through PowerShell + CIM (Win32_*) and a
+# couple of well-known install directories. The HIP SDK ships hipInfo.exe,
+# which is the rocminfo analog. The kernel-mode GPU driver is part of the
+# AMD Adrenalin install and reports itself via Win32_VideoController.
+# ---------------------------------------------------------------------------
+
+def _probe_gpus_windows(e: Examination) -> None:
+    """Enumerate AMD/NVIDIA display adapters via Win32_VideoController."""
+    rc, out, _ = _run([
+        "powershell", "-NoProfile", "-Command",
+        "Get-CimInstance Win32_VideoController | "
+        "Select-Object -Property Name,PNPDeviceID,DriverVersion | "
+        "ConvertTo-Json -Compress",
+    ], timeout=10)
+    if rc != 0 or not out.strip():
+        e.probe_failures.append(
+            "Get-CimInstance Win32_VideoController failed; cannot enumerate GPUs."
+        )
+        return
+    try:
+        data = json.loads(out)
+    except json.JSONDecodeError:
+        e.probe_failures.append("Win32_VideoController returned non-JSON output.")
+        return
+    if isinstance(data, dict):
+        data = [data]
+    for entry in data:
+        if not isinstance(entry, dict):
+            continue
+        name = (entry.get("Name") or "").strip()
+        pnp = (entry.get("PNPDeviceID") or "").strip()
+        is_amd = "VEN_1002" in pnp.upper() or "AMD" in name.upper() or "RADEON" in name.upper()
+        is_nvidia = "VEN_10DE" in pnp.upper() or "NVIDIA" in name.upper()
+        if is_nvidia:
+            e.has_nvidia_gpu = True
+            e.gpus.append(GPU(name=name, pci_id=pnp, is_amd=False, is_apu=False))
+            continue
+        if not is_amd:
+            continue
+        gfx_guess, is_apu_guess = _classify_amd_marketing_name(name)
+        e.gpus.append(GPU(
+            name=name, gfx_target=gfx_guess, pci_id=pnp,
+            is_apu=is_apu_guess, is_amd=True,
+        ))
+
+
+def _probe_hip_sdk_windows(e: Examination) -> None:
+    """Locate the HIP SDK install and run hipInfo for ground-truth gfx target.
+
+    The HIP SDK installer drops files under `C:\\Program Files\\AMD\\ROCm\\<ver>\\`
+    by default and sets `HIP_PATH` (and `HIP_PATH_<ver>`) in the user/machine
+    environment. Multiple SDKs can coexist; we prefer `HIP_PATH` when set
+    because that's what loaders actually pick up.
+    """
+    candidates: list[Path] = []
+    hp = os.environ.get("HIP_PATH")
+    if hp and Path(hp).is_dir():
+        candidates.append(Path(hp))
+    for root in (r"C:\Program Files\AMD\ROCm", r"C:\Program Files (x86)\AMD\ROCm"):
+        try:
+            base = Path(root)
+            if base.is_dir():
+                for child in sorted(base.iterdir(), reverse=True):
+                    if child.is_dir() and re.match(r"\d+(\.\d+)+", child.name):
+                        candidates.append(child)
+        except OSError:
+            continue
+    seen: set[str] = set()
+    chosen: Path | None = None
+    for c in candidates:
+        s = str(c)
+        if s in seen:
+            continue
+        seen.add(s)
+        if chosen is None:
+            chosen = c
+    if chosen is None:
+        return
+    e.hip_sdk_path = str(chosen)
+    m = re.search(r"(\d+(?:\.\d+)+)$", chosen.name)
+    if m:
+        e.hip_sdk_version = m.group(1)
+
+    hipinfo = chosen / "bin" / "hipInfo.exe"
+    if not hipinfo.exists():
+        e.hipinfo_present = False
+        e.hipinfo_status = "missing"
+        return
+    e.hipinfo_present = True
+    rc, out, err = _run([str(hipinfo)], timeout=15)
+    if rc != 0:
+        merged = (out + "\n" + err).lower()
+        if "no rocm" in merged or "no devices" in merged:
+            e.hipinfo_status = "not-loaded"
+        else:
+            e.hipinfo_status = f"error rc={rc}"
+        return
+    e.hipinfo_status = "ok"
+
+    # hipInfo prints a `device# 0` block with `Name:` (gfx target) and
+    # `gcnArchName:` lines. Parse the first GPU device for ground truth.
+    gfx = ""
+    name = ""
+    for line in out.splitlines():
+        s = line.strip()
+        if s.startswith("Name:") and not name:
+            name = s.split(":", 1)[1].strip()
+        if s.startswith("gcnArchName:") and not gfx:
+            val = s.split(":", 1)[1].strip()
+            if val.startswith("gfx"):
+                gfx = val.split(":")[0]
+        if s.startswith("arch:") and not gfx:
+            val = s.split(":", 1)[1].strip()
+            if val.startswith("gfx"):
+                gfx = val
+        if gfx and name:
+            break
+    amd_entries = [g for g in e.gpus if g.is_amd]
+    if gfx and amd_entries:
+        if not amd_entries[0].gfx_target:
+            amd_entries[0].gfx_target = gfx
+            amd_entries[0].is_apu = bool(re.match(r"gfx11[05]\d", gfx))
+        if name and not amd_entries[0].name:
+            amd_entries[0].name = name
+
+
+def _probe_adrenalin_windows(e: Examination) -> None:
+    """Best-effort probe of the AMD Adrenalin / kernel-mode driver version."""
+    rc, out, _ = _run([
+        "powershell", "-NoProfile", "-Command",
+        "(Get-CimInstance Win32_VideoController | "
+        "Where-Object { $_.Name -like '*AMD*' -or $_.Name -like '*Radeon*' } | "
+        "Select-Object -First 1).DriverVersion",
+    ], timeout=8)
+    if rc == 0 and out.strip():
+        e.adrenalin_version = out.strip().splitlines()[0].strip()
+
+
+def _probe_msvc_redist_windows(e: Examination) -> None:
+    """Check whether `vcruntime140.dll` and `vcruntime140_1.dll` are loadable.
+
+    The HIP SDK's amdhip64_*.dll links against the MSVC 2015-2022 runtime;
+    when the redistributable isn't installed, `import torch` fails with a
+    DLL-load error that points at vcruntime140_1.dll.
+    """
+    paths = os.environ.get("PATH", "").split(os.pathsep)
+    sysroot = os.environ.get("SystemRoot") or r"C:\Windows"
+    paths.extend([
+        os.path.join(sysroot, "System32"),
+        os.path.join(sysroot, "SysWOW64"),
+    ])
+    have_140 = False
+    have_140_1 = False
+    for d in paths:
+        if not d:
+            continue
+        try:
+            p = Path(d)
+            if not p.is_dir():
+                continue
+            if (p / "vcruntime140.dll").exists():
+                have_140 = True
+            if (p / "vcruntime140_1.dll").exists():
+                have_140_1 = True
+        except OSError:
+            continue
+        if have_140 and have_140_1:
+            break
+    e.msvc_redist_present = have_140 and have_140_1
+
+
+def _probe_env_windows(e: Examination) -> None:
+    """Capture the env vars the diagnosis catalog reads on Windows.
+
+    Mirrors `_probe_env` for Linux but skips the LD_LIBRARY_PATH scan
+    (Windows uses the PATH-based DLL search instead).
+    """
+    for k in TRACKED_ENV_VARS:
+        v = os.environ.get(k)
+        if v is None:
+            continue
+        if k == "PATH" and len(v) > 4000:
+            v = v[:4000] + "...[truncated]"
+        e.env[k] = v
+
+
+# ---------------------------------------------------------------------------
+# Orchestration
+# ---------------------------------------------------------------------------
+
+def examine(requested_framework: str | None) -> Examination:
+    e = Examination()
+    _probe_os(e)
+    if e.is_wsl:
+        # WSL is a real, common environment but the failure modes there
+        # (Adrenalin Pro on the Windows host, the WSL kernel update, the
+        # /usr/lib/wsl/lib loader handoff) are NOT in this catalog. Refuse
+        # explicitly rather than running Linux-native probes that would all
+        # mislead.
+        e.notes.append(
+            "Detected WSL2. rocm-doctor does not cover the ROCm-on-WSL flow "
+            "(it requires Adrenalin Pro + the WSL kernel update on the "
+            "Windows host). Either run this script on the native Linux "
+            "host, or follow AMD's WSL guide directly: "
+            "https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/install/installryz/wsl/howto_wsl.html"
+        )
+        return e
+    if e.os_family == "linux":
+        _probe_cpu(e)
+        _probe_gpus_lspci(e)
+        _probe_gpus_rocminfo(e)
+        _summarise_gpu_categories(e)
+        _probe_modules(e)
+        _probe_devices(e)
+        _probe_user(e)
+        _probe_secure_boot(e)
+        _probe_rocm_install(e)
+        _probe_env(e)
+        _probe_container(e)
+        _probe_dmesg_amdgpu(e)
+        _probe_framework(e, requested_framework)
+        return e
+    if e.os_family == "windows":
+        _probe_cpu(e)
+        _probe_gpus_windows(e)
+        _probe_hip_sdk_windows(e)
+        _probe_adrenalin_windows(e)
+        _probe_msvc_redist_windows(e)
+        _summarise_gpu_categories(e)
+        _probe_env_windows(e)
+        _probe_framework(e, requested_framework)
+        return e
+    e.notes.append(
+        f"rocm-doctor supports Linux and Windows; got {e.os_family}. "
+        "This skill cannot help on this platform."
+    )
+    return e
+
+
+# ---------------------------------------------------------------------------
+# Human-readable rendering
+# ---------------------------------------------------------------------------
+
+def _fmt_yesno(v: bool | None) -> str:
+    return "unknown" if v is None else ("yes" if v else "no")
+
+
+def _print_gpus(e: Examination) -> None:
+    print("\nGPUs:")
+    if not e.gpus:
+        if e.os_family == "linux":
+            print("  (none detected; lspci returned no VGA/3D/Display controllers)")
+        else:
+            print("  (none detected; Win32_VideoController returned no AMD/NVIDIA adapters)")
+    for g in e.gpus:
+        flag = ""
+        if g.is_amd and g.is_apu:
+            flag = " [AMD APU]"
+        elif g.is_amd:
+            flag = " [AMD dGPU]"
+        elif "NVIDIA" in g.name.upper():
+            flag = " [NVIDIA]"
+        print(f"  - {g.pci_id}  {g.name or 'unknown'}  gfx={g.gfx_target or '?'}{flag}")
+
+
+def _print_framework_block(e: Examination) -> None:
+    print("\nFramework:")
+    print(f"  detected:        {e.framework}")
+    if e.framework_version:
+        print(f"  version:         {e.framework_version}")
+    if e.framework_rocm_version:
+        print(f"  rocm/hip:        {e.framework_rocm_version}")
+    if e.framework_arch_list:
+        print(f"  arch list:       {' '.join(e.framework_arch_list)}")
+    for n in e.framework_notes:
+        print(f"  note: {n}")
+
+
+def _print_env_block(e: Examination) -> None:
+    if not e.env:
+        return
+    print("\nRelevant environment variables (set in current shell):")
+    for k, v in e.env.items():
+        display = v if len(v) <= 200 else (v[:200] + "...")
+        print(f"  {k}={display}")
+
+
+def _print_human_linux(e: Examination) -> None:
+    print(f"Kernel:           {e.kernel_release}")
+    if e.iommu_kernel_param:
+        print(f"  iommu=          {e.iommu_kernel_param}")
+    print(f"CPU:              {e.cpu_model} (vendor: {e.cpu_vendor})")
+    if e.secure_boot != "unknown":
+        print(f"Secure Boot:      {e.secure_boot}")
+    if e.in_container:
+        print(f"Container:        yes ({e.container_kind})")
+
+    _print_gpus(e)
+
+    print("\nDriver & devices:")
+    print(f"  amdgpu loaded:   {_fmt_yesno(e.amdgpu_loaded)}")
+    if e.amdgpu_blacklisted_in:
+        print(f"  amdgpu blacklisted in: {', '.join(e.amdgpu_blacklisted_in)}")
+    print(f"  amdkfd loaded:   {_fmt_yesno(e.amdkfd_loaded)}")
+    print(f"  rocminfo:        {e.rocminfo_status}")
+    if e.kfd:
+        print(f"  /dev/kfd:        exists={e.kfd.exists} mode={e.kfd.mode} "
+              f"owner={e.kfd.owner_user}:{e.kfd.owner_group} "
+              f"r={_fmt_yesno(e.kfd.user_can_read)} w={_fmt_yesno(e.kfd.user_can_write)}")
+    for d in e.render_devices:
+        print(f"  {d.path}:  mode={d.mode} owner={d.owner_user}:{d.owner_group} "
+              f"r={_fmt_yesno(d.user_can_read)} w={_fmt_yesno(d.user_can_write)}")
+
+    print("\nUser:")
+    print(f"  name:            {e.user_name or 'unknown'}")
+    print(f"  in render group: {_fmt_yesno(e.in_render_group)}")
+    print(f"  in video group:  {_fmt_yesno(e.in_video_group)}")
+    if e.user_groups:
+        print(f"  all groups:      {' '.join(e.user_groups)}")
+
+    print("\nROCm install:")
+    print(f"  path:            {e.rocm_path or 'not found'}")
+    print(f"  version:         {e.rocm_version or 'unknown'}")
+    print(f"  install method:  {e.rocm_install_method or 'unknown'}")
+    if e.rocm_repos_seen:
+        print(f"  repos seen:      {len(e.rocm_repos_seen)} file(s)")
+        for r in e.rocm_repos_seen:
+            print(f"    - {r}")
+
+    _print_framework_block(e)
+    _print_env_block(e)
+
+    if e.dmesg_amdgpu_tail:
+        print("\nRecent amdgpu/amdkfd kernel messages (last few interesting lines):")
+        for line in e.dmesg_amdgpu_tail:
+            print(f"  | {line}")
+
+
+def _print_human_windows(e: Examination) -> None:
+    print(f"CPU:              {e.cpu_model or 'unknown'} (vendor: {e.cpu_vendor})")
+
+    _print_gpus(e)
+
+    print("\nDriver & runtime:")
+    print(f"  Adrenalin driver: {e.adrenalin_version or 'unknown'}")
+    print(f"  hipInfo:          {e.hipinfo_status or 'missing'}")
+    print(f"  MSVC redist:      {_fmt_yesno(e.msvc_redist_present)}")
+
+    print("\nHIP SDK install:")
+    print(f"  path:            {e.hip_sdk_path or 'not found'}")
+    print(f"  version:         {e.hip_sdk_version or 'unknown'}")
+
+    _print_framework_block(e)
+    _print_env_block(e)
+
+
+def _print_human(e: Examination) -> None:
+    print("rocm-doctor -- system examination (read-only)")
+    print("=" * 60)
+    print(f"OS:               {e.os_family} {e.distro_id} {e.distro_version}".strip())
+    if e.is_wsl:
+        print("WSL:              yes (out of scope; see notes)")
+    if e.is_wsl or e.os_family not in ("linux", "windows"):
+        for n in e.notes:
+            print(f"  note: {n}")
+        return
+
+    if e.os_family == "linux":
+        _print_human_linux(e)
+    elif e.os_family == "windows":
+        _print_human_windows(e)
+
+    if e.probe_failures:
+        print("\nProbes that did not complete:")
+        for p in e.probe_failures:
+            print(f"  - {p}")
+
+    if e.notes:
+        print("\nNotes:")
+        for n in e.notes:
+            print(f"  - {n}")
+
+    print("\nNext step: feed this examination into diagnose.py:")
+    print("  python scripts/examine.py --json > exam.json")
+    print("  python scripts/diagnose.py --exam exam.json --symptom \"<paste user's error>\"")
+
+
+def _to_jsonable(e: Examination) -> dict:
+    """asdict() handles nested dataclasses; we just rename Optional[Device]."""
+    d = asdict(e)
+    return d
+
+
+def _exit_code(e: Examination) -> int:
+    if e.is_wsl:
+        # WSL is detected but explicitly out of scope. Treat like "wrong
+        # platform" so the agent stops and routes the user.
+        return 2
+    if e.os_family not in ("linux", "windows"):
+        return 2
+    if not e.has_amd_gpu:
+        # NVIDIA-only or no GPU at all -- this skill can't help.
+        return 2
+    # Probes that didn't complete are a soft warning, not a hard fail.
+    if e.os_family == "linux":
+        if e.probe_failures and not e.rocminfo_present and not e.gpus:
+            return 3
+    else:  # windows
+        if e.probe_failures and not e.hipinfo_present and not e.gpus:
+            return 3
+    return 0
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--json", action="store_true",
+                        help="Emit machine-readable JSON for diagnose.py.")
+    parser.add_argument(
+        "--framework",
+        choices=["pytorch", "llama-cpp", "skip", "auto"],
+        default="auto",
+        help="Which framework probe to run (default: auto-detect).",
+    )
+    args = parser.parse_args(argv)
+
+    requested = None if args.framework == "auto" else args.framework
+    e = examine(requested)
+    if args.json:
+        print(json.dumps(_to_jsonable(e), indent=2))
+    else:
+        _print_human(e)
+    return _exit_code(e)
+
+
+# Script entry point: propagate main()'s return value as the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())