diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 18b62b1..e06a6ab 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -25,6 +25,12 @@ "source": "./skills/local-ai-use", "skills": "./", "description": "Route image generation, text-to-speech, and speech-to-text through a local AI Server to reduce token/cost usage." + }, + { + "name": "rocm-doctor", + "source": "./skills/rocm-doctor", + "skills": "./", + "description": "Diagnose why ROCm, PyTorch, or llama.cpp isn't working on an AMD GPU. Matches the symptom against a fixed list of fifteen known misconfigurations and proposes the next step." } ] } diff --git a/README.md b/README.md index 3c06e79..c43de71 100644 --- a/README.md +++ b/README.md @@ -200,7 +200,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for step-by-step instructions, the full a ## Status -This repository is in its early days. In-repo skills include `skills/local-ai-app-integration/` and `skills/local-ai-use/`, seeding the **Application integration** focus area, and `skills/apu-memory-tuner/`, seeding the **Hardware-native** focus area. The remaining skills are being built out incrementally alongside manifests and CI. Expect rapid iteration. +This repository is in its early days. In-repo skills include `skills/local-ai-app-integration/` and `skills/local-ai-use/`, seeding the **Application integration** focus area, and `skills/apu-memory-tuner/` and `skills/rocm-doctor/`, seeding the **Hardware-native** focus area. The remaining skills are being built out incrementally alongside manifests and CI. Expect rapid iteration. 
## License diff --git a/skills/rocm-doctor/SKILL.md b/skills/rocm-doctor/SKILL.md new file mode 100644 index 0000000..f27ac1f --- /dev/null +++ b/skills/rocm-doctor/SKILL.md @@ -0,0 +1,285 @@ +--- +name: rocm-doctor +description: >- + Diagnoses why ROCm, the HIP SDK, PyTorch, or llama.cpp is broken on an + AMD GPU on Linux or Windows, and either applies a low-risk fix with + consent or hands back the exact next step. Also routes Lemonade, LM + Studio, and Ollama issues to the right upstream channel. Use when the + user reports that ROCm or HIP isn't working, torch.cuda.is_available() + is False on Ryzen AI, rocminfo or hipInfo can't see the GPU, + or hits hipErrorNoBinaryForGpu, + HSA_STATUS_ERROR_INVALID_ISA, invalid device function, missing + amdhip64_6.dll, vcruntime140_1.dll, or libamdhip64.so, cannot open + /dev/kfd, ROCk module not loaded, an Adrenalin driver too old for the + HIP SDK, or a ROCm wheel that doesn't recognize gfx1151, gfx1150, or + gfx1103; or mentions HSA_OVERRIDE_GFX_VERSION, + HIP_VISIBLE_DEVICES, PYTORCH_ROCM_ARCH, render-group permissions, + amdgpu blacklist, Secure Boot, iGPU/dGPU collisions, or multi-GPU + hangs. Do not use for non-AMD GPUs, performance + tuning, or ROCm-on-WSL2. +--- + +# ROCm Doctor + +Given a "ROCm/PyTorch/llama.cpp isn't working on my AMD GPU" complaint, +identify which **known misconfiguration** is the cause and either fix it +or hand back the exact next step. + +This is a diagnose-and-fix skill, not a setup or tuning skill. The +catalog of failure modes is a **closed list** that lives in +`reference.md` and `scripts/diagnose.py`: if the user's symptom doesn't +match one of them, the skill explicitly routes upstream rather than +guessing. New failure modes get added by editing the catalog, not by +the agent inventing them at runtime. 
+ +## When to use this skill + +Use it when **any** of the following are true: + +- The user has an **AMD** GPU and a functional error with **PyTorch**, + **llama.cpp**, or anything else built directly against the system ROCm + (`/opt/rocm` or a pip wheel that bundles HIP). The skill examines the + host and diagnoses against the catalog. +- The user is on **Lemonade**, **LM Studio**, or **Ollama**. These apps + ship their own ROCm and don't need a host-level examination, but the + user often doesn't know *where* to report the problem -- the skill + knows the right upstream channel for each (see + [Framework routing](#framework-routing)) and hands it over. + +Out of scope: + +- NVIDIA / Intel / Apple Silicon GPUs. Exit cleanly and tell the user. +- Fresh installs on a clean machine. That's a setup task; point at + [`amdgpu-install`](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/install-overview.html) + (Linux) or the [HIP SDK installer](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html) + (Windows). +- Pure performance complaints. Those belong in `mi-tuner` / + `omniperf-tune` / `apu-memory-tuner`. +- **WSL2** (running Linux on top of Windows). The ROCm-on-WSL flow needs + Adrenalin Pro plus the WSL kernel update on the Windows host -- those + failure modes are not in this catalog. `examine.py` detects WSL via + `/proc/version` and exits 2 with a route-out message; if the user wants + WSL specifically, point them at AMD's ROCm-on-WSL install guide. + +## Prerequisites + +- **OS:** Linux **or** Windows (native). The catalog has 12 Linux entries + (5 of which are also valid on Windows) and 3 Windows-only entries; the + scripts pick the right subset for the host they run on. 
+- **Linux tools the agent will invoke as part of examination** (best-effort; + the script degrades when one is missing): + - `lspci` (always present on desktop distros) + - `rocminfo` (when ROCm is installed) + - `journalctl` or `dmesg` (for amdgpu kernel-ring evidence) + - `python` / `python3` to introspect PyTorch + - `llama-cli` / `llama-server` / `main` to introspect llama.cpp +- **Windows tools the agent will invoke as part of examination**: + - `powershell` (always present on Windows 10+) for `Get-CimInstance + Win32_VideoController` / `Win32_Processor` and the env-scope reads. + - `hipInfo.exe` from `%HIP_PATH%\bin` -- the Windows analog of `rocminfo`. + Absence is itself a signal (see `fix-13-hip-sdk-missing`). + - `setx` for env-var persistence and User-PATH edits (analog of editing + `~/.bashrc` on Linux). + - `python` to introspect PyTorch. +- **Permissions:** examination is fully read-only and works as a regular + user on both OSes. Linux fixes that need `sudo` are flagged in their + recipe metadata; Windows fixes that touch the Machine env scope are + flagged similarly and `apply_fix.py` does NOT self-elevate -- the user + has to run an Administrator PowerShell when those are required. + +Silent footguns to surface explicitly when relevant: + +- `HSA_OVERRIDE_GFX_VERSION` -- forcing an unsupported gfx target works + for `rocminfo`/`hipInfo` but causes page faults at runtime. Diagnosis + `fix-2-unset-override` is the response when this is set on a GPU that + already has a native wheel; on Windows it can be persisted in either + the User or Machine env scope, so check both. +- `HIP_VISIBLE_DEVICES` -- on dual-GPU systems (APU + dGPU) the iGPU is + often index 0 and destabilises HIP unless explicitly hidden. +- `HIP_PATH` (Windows) -- if the user has multiple HIP SDK versions + installed under `C:\Program Files\AMD\ROCm\`, `HIP_PATH` decides which + one PyTorch / hipInfo actually loads. 
Pointing it at the wrong major + produces the same failure mode as `fix-8-wheel-rocm`. +- `PYTORCH_ROCM_ARCH` -- only honored during a *build* of PyTorch. Setting + it at runtime does nothing for a prebuilt wheel. +- `LD_LIBRARY_PATH` (Linux) -- a wheel-bundled `libamdhip64.so` shadowed + by a system one (or vice versa) gives confusing `cannot open shared + object file` errors that look like fix-8 but are really a load-order + bug. The Windows analog is `PATH` order: a stale HIP SDK bin directory + earlier on PATH than the one matching `HIP_PATH`. + +## The three-step flow + +Run these in order. The first two are read-only. The third asks before +changing anything. + +``` +[ ] 1. Identify the framework, then examine (read-only). +[ ] 2. Diagnose: match examination + symptom against the catalog. +[ ] 3. Propose the fix; only apply with explicit consent; re-verify. +``` + +### Step 1: identify the framework and examine + +If the user hasn't said, ask which framework they are running. Use the +`AskQuestion` tool with PyTorch / llama.cpp / Lemonade / LM Studio / +Ollama / other as the options. The routing in [Framework routing](#framework-routing) +keys off the answer. + +If the framework is in the "skip examination" bucket, jump straight to +the upstream link and exit. Otherwise run: + +```bash +python scripts/examine.py --framework pytorch --json > exam.json +``` + +Replace `pytorch` with `llama-cpp`, or pass `--framework auto` to let the +script pick. Exit codes: + +| Exit | Meaning | Next action | +|---|---|---| +| 0 | Examined; AMD GPU present | Continue to Step 2. | +| 2 | Wrong platform (WSL, neither Linux nor Windows, no AMD GPU) | Stop. Route the user. | +| 3 | Probes partially failed | Continue but warn the user. | + +For a quick read-only summary without piping JSON, drop `--json`: + +```bash +python scripts/examine.py --framework pytorch +``` + +`examine.py` collects exactly the facts the diagnosis catalog needs. 
+On Linux: OS / kernel, AMD GPUs and gfx targets, `amdgpu` / `amdkfd` +status, `/dev/kfd` ownership and group, user's group membership, system +ROCm version and install method, framework version and arch list, the +silent-footgun env vars, container/IOMMU state, and recent `amdgpu` +kernel log lines. On Windows: AMD adapters and gfx targets via +`Win32_VideoController` + `hipInfo.exe`, the HIP SDK install path and +version, the Adrenalin / kernel-mode driver version, MSVC redistributable +presence, and the same env-var snapshot. It deliberately does NOT spawn +heavy probes (no kernel launches, no model downloads). + +### Step 2: diagnose + +Hand the JSON snapshot plus the user's error text to `diagnose.py`: + +```bash +python scripts/diagnose.py --exam exam.json \ + --symptom "HIP error: invalid device function on gfx1151" +``` + +The script runs every checker in the catalog, scores each from 0..100, +and prints a ranked list. Each match has a stable `fix-N-...` id used by +`apply_fix.py`. + +Score tiers: + +- `>= 75` (`HIGH`) -- propose the fix and (if auto-applicable) ask for + consent to apply it. +- `>= 50` (`LIKELY`) -- describe the match and ask the user to confirm one + more piece of evidence before applying. +- Below `50` -- print but do **not** act. If nothing scores `>= 50`, the + script exits 1 with a single-line route to the right upstream tracker. + Do not speculate. + +JSON output (`--json`) is the same data the agent should use programmatically: + +```bash +python scripts/diagnose.py --exam exam.json --symptom "..." --json +``` + +### Step 3: apply the fix (with consent) + +Show the user the proposed fix (it's already printed by `diagnose.py`). +If they consent, run: + +```bash +python scripts/apply_fix.py --fix-id fix-4-render-group --dry-run +python scripts/apply_fix.py --fix-id fix-4-render-group --yes +``` + +`--dry-run` prints the exact commands without executing. 
`--yes` skips +the interactive `[y/N]` prompt (only pass this after the user has agreed +in chat). + +A subset of fixes are auto-applicable; the rest are deliberately +print-only because the risk of a half-applied state is too high for an +agent to take. To see which is which without consulting `reference.md`: + +```bash +python scripts/apply_fix.py --list +``` + +That prints every `fix-id` with an `AUTO` or `PRINT-ONLY` tag. Auto +fixes are bounded operations like unsetting an env var, adding the user +to a group, or appending a single line to a shell rc. Print-only fixes +involve reinstalling frameworks, editing GRUB, regenerating the +initramfs, or moving system repo files; those need a human at the +keyboard. + +After every fix, re-run the `verify` command the recipe printed. Only +declare success when the user's *original* failing command now succeeds +(e.g. `torch.cuda.is_available()` returns `True`, `rocminfo` lists the +GPU, the llama.cpp build runs). + +## Framework routing + +The skill's first decision is which framework the user runs. Some +frameworks ship their own ROCm and bypass the system install; for those +the right answer is "you're in the wrong place, here's where to file +it", and the skill delivers that answer directly rather than running +useless probes against the host. + +| Framework | Examine the host? | Action | +|---|---|---| +| PyTorch (Linux ROCm wheel) | Yes | `python scripts/examine.py --framework pytorch`, then `diagnose.py`. | +| PyTorch (Windows TheRock wheel) | Yes | Same scripts; on Windows the catalog filters to Linux+Windows + Windows-only entries. | +| llama.cpp (built against system ROCm/HIP SDK) | Yes | `python scripts/examine.py --framework llama-cpp`, then `diagnose.py`. | +| Lemonade | No -- ships its own ROCm | Route to and the Lemonade [Discord](https://discord.gg/5xXzkMu8Zk). | +| LM Studio | No -- ships its own runtime | Route to (in-app support; no public repo). 
| +| Ollama | No -- ships its own runtime | Route to the Ollama GitHub issue tracker and the Ollama Discord. | +| vLLM / SGLang | Out of scope until phase 1+ | Route to the project's own issue tracker. | + +If a Lemonade / LM Studio / Ollama user *does* have a host-level ROCm +problem (rare), it shows up when their app fails AND a standalone +`rocminfo` (Linux) / `hipInfo.exe` (Windows) also fails. Only then +escalate to the full examination. + +## Safety rules + +- Read-only by default. Examination and diagnosis never change state. +- Always print before applying. `apply_fix.py` shows every command before + running it, even when `--yes` skips the consent prompt. +- Never reboot, never touch BIOS, never flash firmware. +- Never reinstall system packages without an interactive prompt or `--yes`. +- Never set `HSA_OVERRIDE_GFX_VERSION` as the *first* fix when a native + wheel exists. That is `fix-2-unset-override`'s entire reason for being. +- Never silently fall back to a different fix when the requested one + isn't applicable. Exit 3 and tell the user why. +- When nothing in the catalog matches, **do not speculate**. Hand the + user the upstream tracker URL from `diagnose.py --json`. + +## Verification checklist + +Mark this skill complete only when **all** are true: + +- [ ] `python scripts/examine.py` exits 0 (or 3 with the user's explicit + go-ahead to continue despite a partial probe). +- [ ] `python scripts/diagnose.py --exam exam.json --symptom "..."` exits 0 + and surfaced exactly one HIGH-confidence diagnosis, OR it exited 1 + and the user has been routed to the right upstream tracker. +- [ ] If a fix was applied: the recipe's `verify` command exits cleanly. +- [ ] The user's *original* failing command now succeeds end-to-end (run + it again in their original shell). +- [ ] If any fix needed a re-login or reboot, the user has actually done + it before declaring success. + +If any box is unchecked, the failure isn't resolved -- say so out loud +rather than declaring victory. 
+ +## Reference + +For the full catalog of known misconfigurations, every fix-id and its +verify command, the silent-footgun env-var reference, and the +upstream-routing table in machine-readable form, see +[reference.md](reference.md). diff --git a/skills/rocm-doctor/reference.md b/skills/rocm-doctor/reference.md new file mode 100644 index 0000000..07d4120 --- /dev/null +++ b/skills/rocm-doctor/reference.md @@ -0,0 +1,363 @@ +# ROCm Doctor -- Reference + +Detailed background for the `rocm-doctor` skill. Read this only when the +three-step flow in `SKILL.md` doesn't cover a decision. + +## Contents + +- [The known-misconfigurations catalog](#the-known-misconfigurations-catalog) +- [Silent-footgun environment variables](#silent-footgun-environment-variables) +- [Windows-specific footguns](#windows-specific-footguns) +- [Framework support matrix](#framework-support-matrix) +- [Device support, phased](#device-support-phased) +- [Live AMD compatibility matrices](#live-amd-compatibility-matrices) +- [Wheel index reference](#wheel-index-reference) +- [Upstream routing](#upstream-routing) +- [Why we do not auto-set HSA_OVERRIDE_GFX_VERSION](#why-we-do-not-auto-set-hsa_override_gfx_version) +- [Why WSL is out of scope](#why-wsl-is-out-of-scope) +- [Adjacent problem: matrices in hand-typed tables](#adjacent-problem-matrices-in-hand-typed-tables) + +--- + +## The known-misconfigurations catalog + +The closed list `diagnose.py` checks against. Each row maps to one +`fix-N-...` recipe in `apply_fix.py`. **If a user's symptom doesn't +match any of these, the skill must not speculate** -- it exits 1 and +prints the upstream tracker URL from `_route_when_no_match`. + +This catalog grows over time. To add a new failure mode: add a +`check_N_*` function to `scripts/diagnose.py`, a `FixRecipe` with the +matching `fix-id` to `scripts/apply_fix.py`'s `RECIPES`, and a row to +the table below. 
The decision-tree contract -- score 0..100, emit the +recipe's `verify` command on a hit, exit 1 + route upstream on a miss -- +stays the same regardless of catalog size. + +| # | fix-id | OS | Failure pattern | Typical signal | Default fix | +|---|---|---|---|---|---| +| 1 | `fix-1-arch` | both | GPU `gfx` target not in framework's compiled arch list | `hipErrorNoBinaryForGpu`, `HIP error: invalid device function`, `HSA_STATUS_ERROR_INVALID_ISA`, `torch.cuda.get_arch_list()` missing the GPU's gfx | Reinstall the framework from a wheel index that ships kernels for the GPU's gfx (TheRock per-gfx wheels are the recommended fallback, and the only first-party option on Windows AMD). | +| 2 | `fix-2-unset-override` | both | `HSA_OVERRIDE_GFX_VERSION` set on a GPU that has a native wheel | Hangs, `amdgpu: page fault` in `dmesg`, `OUT_OF_REGISTERS` from the compiler | Linux: `unset HSA_OVERRIDE_GFX_VERSION` and remove from shell rc. Windows: `setx HSA_OVERRIDE_GFX_VERSION ""`, plus check the Machine env scope. | +| 3 | `fix-3-rocm-kernel` | linux | ROCm <-> distro/kernel forms an unsupported triple | `amdgpu-install` DKMS build fails; `amdgpu` not loaded after install | Cross-check the live AMD compatibility matrix; install matching HWE kernel; consider `--no-dkms`. | +| 4 | `fix-4-render-group` | linux | User not in `render` / `video` groups, or `/dev/kfd` group is wrong | `Unable to open /dev/kfd: Operation not permitted`; `rocminfo` works under `sudo` but not as user | `sudo usermod -a -G render,video "$USER"`; log out/in. | +| 5 | `fix-5-amdgpu-load` | linux | `amdgpu` kernel module not loaded or blacklisted | `rocminfo` says "ROCk module is NOT loaded"; `lsmod \| grep amdgpu` empty; blacklist line in `/etc/modprobe.d/*` | Remove blacklist; `update-initramfs -u`; `modprobe amdgpu`; check Secure Boot. 
| +| 6 | `fix-6-path` | both | ROCm/HIP binaries not on `PATH` after install | `rocminfo: command not found` (Linux) or `hipInfo.exe` not in `%PATH%` (Windows) immediately after a clean install | Linux: append `/opt/rocm/bin` to `PATH` in the shell rc. Windows: `setx PATH "%PATH%;C:\Program Files\AMD\ROCm\X.Y\bin"` (substitute the installed SDK version for `X.Y`) and reopen the shell. | +| 7 | `fix-7-stale-repos` | linux | Stale / conflicting APT or DNF repos from prior installer runs | `404` on `repo.radeon.com`, "Release file not valid", mixed-version packages | Quarantine duplicate repo files in `/etc/apt/sources.list.d/`; re-run `apt update` cleanly. | +| 8 | `fix-8-wheel-rocm` | both | Framework wheel built for a different ROCm/HIP major than the system | Linux: `libamdhip64.so.X: cannot open shared object file`. Windows: `amdhip64_X.dll could not be found` / `DLL load failed`. | Reinstall the framework from the index matching the system ROCm/HIP SDK major (or upgrade the system to match). | +| 9 | `fix-9-igpu-dgpu` | both | iGPU enumerated alongside dGPU and destabilising the runtime | Random crashes / segfaults on systems with both an APU and a dGPU | Linux: `export HIP_VISIBLE_DEVICES=N`. Windows: `setx HIP_VISIBLE_DEVICES N` and reopen the shell. In both cases `N` is the dGPU's device index. | +| 10 | `fix-10-container` | linux | Container can't see `/dev/kfd` or `/dev/dri/renderD*` | `rocminfo` inside container fails with permission denied; host works | Re-launch with `--device=/dev/kfd --device=/dev/dri --group-add render`; on rootless podman also `--userns=keep-id`. | +| 11 | `fix-11-iommu` | linux | Multi-GPU hang when IOMMU is in default 'on' mode | First multi-GPU job hangs indefinitely | Add `iommu=pt` to the kernel cmdline; reboot. | +| 12 | `fix-12-installer` | linux | `amdgpu-install` left a half-configured state | Subsequent `apt update` errors; `dpkg` complains about half-configured packages; `--accept-eula` repo regression | Run the documented uninstall sequence, then reinstall without the offending flag. 
| +| 13 | `fix-13-hip-sdk-missing` | windows | Framework links HIP but the HIP SDK isn't installed on this host | `amdhip64_X.dll not found`, `Could not find HIP`, `hipInfo` is not a command, `HIP_PATH` unset | Install the AMD HIP SDK matched to the framework's HIP major from the [HIP SDK installer](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html) page. | +| 14 | `fix-14-adrenalin-too-old` | windows | Adrenalin / kernel-mode driver older than the version the HIP SDK pairs with | HIP SDK installed but `hipInfo.exe` reports no agents; `driver too old` style errors | Update Adrenalin from AMD's driver download page; cross-check the SDK release notes for the exact pairing; reboot. | +| 15 | `fix-15-msvc-redist` | windows | MSVC 2015-2022 runtime DLL missing -- HIP DLLs cannot load | `vcruntime140.dll` / `vcruntime140_1.dll` missing dialog; `api-ms-win-crt-*.dll` errors | Install the VC++ 2015-2022 redistributable from Microsoft's download page. | + +For the exact heuristics each checker uses (state signals vs. symptom +keyword weights), see the per-function comments in `scripts/diagnose.py`. + +## Silent-footgun environment variables + +These four change ROCm/HIP behaviour without printing a warning. Each one +gets a named callout in this section because they account for a +disproportionate share of "ROCm doesn't work" reports. + +### `HSA_OVERRIDE_GFX_VERSION` + +Tells HSA to advertise a different `gfx` target to user-space than the +kernel actually has. Useful in exactly one situation: when no +framework wheel ships kernels for your real gfx and a close-enough gfx +exists. Outside that case it causes page faults at runtime because the +compiler emits ISA for the override target but the hardware executes a +different ISA. + +The doctor's default response when this variable is set on a GPU that +*does* have a native wheel is `fix-2-unset-override`, which: + +1. Tells the user the variable is set. +2. Suggests `unset HSA_OVERRIDE_GFX_VERSION`. +3. Greps the user's shell rc files for persistent exports and points + them at the lines to delete. + +It deliberately does not edit the user's dotfiles. 
Editing someone +else's `~/.bashrc` is too easy to get wrong and too easy to forget you +did. + +### `HIP_VISIBLE_DEVICES` / `ROCR_VISIBLE_DEVICES` + +The HIP / HSA equivalents of `CUDA_VISIBLE_DEVICES`. They restrict which +agents the runtime enumerates, by integer index in `rocminfo` order. +Setting either to `0,1` does not change anything on a single-GPU box but +matters on dual-GPU boxes (APU + dGPU, or two dGPUs). + +The doctor uses `HIP_VISIBLE_DEVICES` (not `ROCR_VISIBLE_DEVICES`) +because both ROCm and PyTorch honour it; PyTorch also honours +`CUDA_VISIBLE_DEVICES` as an alias on HIP builds, which surprises +users who set both to different values. If both are set, the agent +should ask the user to pick one and unset the other. + +### `PYTORCH_ROCM_ARCH` + +A **build-time** variable, not a runtime one. Used when compiling +PyTorch from source to select which `gfx` targets the wheel will ship +kernels for. Setting it at runtime against a prebuilt wheel does +nothing; the wheel's arch list was baked at build time. + +The agent should treat `PYTORCH_ROCM_ARCH` in a user's runtime shell as +a tell that the user has been pasting recipes from the wrong tutorial. +It is not a fix; it is misinformation. + +### `LD_LIBRARY_PATH` + +Frameworks that bundle their own HIP (most pip wheels) ship a private +`libamdhip64.so.X`. If the user has `LD_LIBRARY_PATH` pointing at a +system `/opt/rocm/lib` that contains a different major version, the +loader may pick the wrong one and the import fails with `cannot open +shared object file` or `version 'X' not found`. This LOOKS like +`fix-8-wheel-rocm` (wheel/ROCm major mismatch) but the underlying cause +is a load-order conflict. + +If `examine.py` reports `hip_libs_on_ld_path=true` and the framework +also bundles HIP, suggest unsetting `LD_LIBRARY_PATH` and re-running the +import before reinstalling anything. 
+ +## Windows-specific footguns + +Windows uses different mechanisms for the same failure modes Linux has; +keep the analogies straight rather than transplanting Linux fixes. + +### `HIP_PATH` and multiple HIP SDK installs + +The HIP SDK installer drops files under +`C:\Program Files\AMD\ROCm\X.Y\` (`X.Y` = SDK version) and sets `HIP_PATH` (and a +versioned `HIP_PATH_XY`) in the user/machine env. Multiple SDKs can +coexist on disk; whichever `HIP_PATH` points at is the one PyTorch and +`hipInfo.exe` actually load. Pointing it at the wrong major has the same +end result as `fix-8-wheel-rocm` -- `amdhip64_X.dll` from the SDK's `bin` +directory has the wrong major number for the installed framework. + +`examine.py` records the `HIP_PATH` env var alongside the discovered SDK +install path. When they disagree (`HIP_PATH` is set but `hip_sdk_path` +points at a different directory), surface both values to the user and let +them decide which one is right before any other fix. + +### PATH ordering on Windows + +Windows uses PATH for DLL search; there is no separate `LD_LIBRARY_PATH`-style variable. +If the user has more than one `...\AMD\ROCm\X.Y\bin` on PATH, the first +one wins for DLL resolution, which can be a different SDK than `HIP_PATH` +points at. The signal is the same as Linux's load-order conflict: a +`cannot find amdhip64_X.dll` error that doesn't go away after reinstalling +the right SDK. + +### Adrenalin pairing + +The user-space HIP SDK and the kernel-mode driver (Adrenalin / Adrenalin +Pro) have to match. AMD bumps the supported pairing every HIP SDK +release; the live table is in +the HIP SDK release notes. +We deliberately do NOT hardcode a minimum Adrenalin version in +`diagnose.py` -- the table goes stale within months. `fix-14-adrenalin-too-old` +triggers on observable failure (HIP SDK present + `hipInfo.exe` cannot +enumerate, or matching keyword in the user's symptom) and routes the user +to the live page. 
+ +### MSVC redistributable + +The HIP SDK's `amdhip64_*.dll` links against the MSVC 2015-2022 runtime +(`vcruntime140.dll`, `vcruntime140_1.dll`). Without the redistributable, +`import torch` fails with a missing-DLL dialog that points at +`vcruntime140_1.dll`, not at the HIP runtime. `fix-15-msvc-redist` is +specifically the path for this -- do NOT route it to `fix-8-wheel-rocm` +even though the surface error involves a missing DLL. + +### `setx` does not affect open shells + +Both `apply_fix.py`'s Windows runners and the recipe `commands` use +`setx` to persist env vars. `setx` writes to the User registry but does +NOT update the current process or already-open shells. After running any +`setx`-based fix, instruct the user to close and reopen the terminal +before re-verifying. + +## Framework support matrix + +The skill's first decision is which framework the user is running. Only +the "yes" rows trigger system examination; the "no" rows route upstream +without running any local probes. + +| Framework | Examine the system? | Action | +|---|---|---| +| **PyTorch** (Linux ROCm wheels) | Yes | `python scripts/examine.py --framework pytorch` followed by `scripts/diagnose.py`. | +| **PyTorch** (Windows TheRock wheels) | Yes | Same scripts; on Windows `diagnose.py` filters the catalog to the cross-platform + Windows-only entries. | +| **llama.cpp** (built against system ROCm/HIP SDK) | Yes | `python scripts/examine.py --framework llama-cpp` followed by `scripts/diagnose.py`. | +| **Lemonade** | No -- ships its own ROCm | Route to + [Discord](https://discord.gg/5xXzkMu8Zk). | +| **LM Studio** | No -- ships its own runtime | Route to + Discord (in-app support, no public repo). | +| **Ollama** | No -- ships its own runtime | Route to + Discord. | +| **vLLM** | Out of scope until phase 1+ | Route to . | +| **SGLang** | Out of scope until phase 1+ | Route to . 
| + +If a Lemonade / LM Studio / Ollama user reports a problem AND a +standalone `rocminfo` (Linux) / `hipInfo.exe` (Windows) also fails (i.e. +the issue is the host install, not the bundled runtime), only then +escalate to a full examination. That is rare; the default action is +still to route upstream. + +## Device support, phased + +The skill ships in three phases. Phase 0 is the only one validated end +to end; later phases reuse the same scripts but loosen heuristics in +`diagnose.py`. + +| Phase | GPUs | Status | +|---|---|---| +| 0 | Ryzen AI APUs (Strix Halo, Strix Point, Krackan, Phoenix, Hawk Point) -- gfx1151 / gfx1150 / gfx1103 / gfx1036 | Validated. Default target. | +| 1 | Instinct (MI300X, MI300A, MI250, MI210) -- gfx942 / gfx90a | Scripts work; not validated against the full failure list. | +| 2 | Radeon dGPUs (RDNA3, RDNA4) -- gfx1100, gfx1101, gfx1102, gfx12xx | Scripts work; iGPU/dGPU collision logic specifically targets this case. | + +## Live AMD compatibility matrices + +Hand-typed kernel/ROCm/distro matrices in skill bodies go stale within +months. Always fetch live from these pages instead of inlining them: + +- **ROCm Linux system requirements** (kernel ranges, distro versions, + Python versions): +- **ROCm release compatibility matrix** (per-release driver / framework + versions): +- **RDNA3.5 system optimization** (APU-specific kernel notes referenced + by `apu-memory-tuner`): + +`diagnose.py`'s `fix-3-rocm-kernel` recipe always links to the first +page rather than asserting a fixed kernel floor. The same goes for +wheel-index URLs in `fix-1-arch` and `fix-8-wheel-rocm`. + +## Wheel index reference + +For `fix-1-arch` and `fix-8-wheel-rocm`, prefer indexes in this order: + +### Linux + +1. **Official PyTorch ROCm wheels** -- `https://download.pytorch.org/whl/rocm6.4` + (stable) and `https://download.pytorch.org/whl/nightly/rocm6.4` (nightly). + Replace `6.4` with the user's system ROCm major. +2. **TheRock per-gfx wheels** -- . 
+ The recommended fallback when the official index doesn't yet cover + a gfx (typically true for newly released APUs in the first 2-3 ROCm + releases after launch). +3. **Build from source** -- last resort. Pin `PYTORCH_ROCM_ARCH=gfxNNNN` + at build time, not at runtime. See the PyTorch ROCm build guide. + +### Windows + +1. **TheRock Windows wheels** -- the TheRock project on GitHub. The + live source of truth for which gfx targets are supported on Windows + right now and which HIP SDK major each wheel pairs with. Always pull + the install command from the project README rather than asserting a + fixed `--index-url` here. +2. **Build from source** -- last resort. Requires Visual Studio Build + Tools, the HIP SDK on PATH, and `HIP_PATH` set. See the PyTorch ROCm + build guide for the Windows-specific environment variables. + +For llama.cpp: + +```bash +# Linux: +cmake -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfxNNNN +cmake --build build -j +``` + +```powershell +# Windows: needs the HIP SDK installed and HIP_PATH set; targets MSVC. +cmake -B build -G "Visual Studio 17 2022" -DGGML_HIP=ON ` + -DAMDGPU_TARGETS=gfxNNNN +cmake --build build --config Release +``` + +`AMDGPU_TARGETS` accepts a semicolon-separated list. Build a fat binary +for multiple GPUs with `-DAMDGPU_TARGETS="gfx1100;gfx1151"` (quote it so the shell does not split the command on `;`). + +## Upstream routing + +When `diagnose.py` returns no matches (exit 1), route the user to +exactly one upstream tracker rather than guessing. The mapping +`UPSTREAM_TRACKERS` in `diagnose.py` is the source of truth; the +abbreviated version: + +| Framework | Tracker | +|---|---| +| PyTorch | PyTorch GitHub issues (tag with `rocm`) | +| llama.cpp | llama.cpp GitHub issues | +| Lemonade | Lemonade GitHub issues | +| Ollama | Ollama GitHub issues | +| LM Studio | LM Studio support (in-app support) | +| ROCm core (default) | ROCm GitHub issues | + +Always attach the JSON from `python scripts/examine.py --json` to the +upstream report. It contains the kernel, GPU(s), ROCm version, install +method, framework version, and the env-var snapshot that the upstream +maintainer would otherwise have to ask for. 
+ +## Why we do not auto-set `HSA_OVERRIDE_GFX_VERSION` + +This deserves its own callout because every other "ROCm not working" +tutorial on the internet suggests it as the first fix. We deliberately +suggest it last. + +`HSA_OVERRIDE_GFX_VERSION` works by tricking HSA into reporting the +override gfx string to user space. The compiler then emits ISA for the +*override* target. The hardware still executes the ISA it natively +supports. When the two are close (e.g. gfx1100 → gfx1030) most kernels +run; when they differ in subtle ways (register count, LDS layout, queue +size) you get OUT_OF_REGISTERS, page faults, or silently wrong results. + +Per the SCOPE document's success criteria: + +> The skill never proposes `HSA_OVERRIDE_GFX_VERSION` as the *first* +> fix when a native wheel exists for the user's `gfx` target. + +`diagnose.py`'s `fix-1-arch` recipe lists the override only in the notes +field, marked as a fallback when no native wheel exists. The auto-applied +path (`fix-2-unset-override`) is the OPPOSITE direction: removing the +override when the user already has one set unnecessarily. + +## Why WSL is out of scope + +`examine.py` detects WSL2 (via `microsoft` in `/proc/version` or +`WSL_DISTRO_NAME` in the environment) and exits 2 with a route-out +message. It does this on purpose: ROCm-on-WSL has its own failure modes +that are NOT in this catalog, and pretending they are Linux-native bugs +just gives users wrong fixes. + +What's actually different on WSL: + +- The kernel-mode driver lives on the **Windows host**, not in WSL. The + user needs a recent Adrenalin Pro / Adrenalin install on the host, plus + the WSL kernel update. None of those touch the WSL distro. +- `/dev/kfd` is replaced by `/dev/dxg` (the DirectX-on-WSL passthrough); + the `fix-4-render-group` and `fix-5-amdgpu-load` checks would fail for + the wrong reasons.
+- The HIP runtime libraries are loaded via `/usr/lib/wsl/lib/` rather + than `/opt/rocm/lib`, so an `LD_LIBRARY_PATH` debug session is + qualitatively different. + +If a WSL user really does need a host-level ROCm fix, the right path is +the WSL install guide: +. Once +those WSL-specific prereqs are in place, the user is back to running +either pure Windows (this skill) or pure native Linux (this skill); WSL +itself stays out of scope. + +## Adjacent problem: matrices in hand-typed tables + +Most of what this skill needs (supported GPUs, kernel ranges, ROCm +releases, wheel arch lists, gfx families) is scattered across hand-typed +tables in docs pages, READMEs, and release notes. Everyone re-parses the +same matrix, and they drift. + +The real fix is bigger than this skill: ROCm wants a **single, +agent-friendly source of truth** that feeds both the docs and skills like +`rocm-doctor`. Until that exists, the scripts here defer to +`rocm.docs.amd.com` at run time (`fix-3-rocm-kernel` links to the live +page rather than asserting a version) and the skill body is careful not +to assert a matrix that will be wrong in 90 days. + +When ROCm ships that source of truth, `examine.py` and `diagnose.py` +should switch to it. Until then, prefer "here is the live URL" over +"the supported kernels as of this writing are". diff --git a/skills/rocm-doctor/scripts/apply_fix.py b/skills/rocm-doctor/scripts/apply_fix.py new file mode 100644 index 0000000..71f8a43 --- /dev/null +++ b/skills/rocm-doctor/scripts/apply_fix.py @@ -0,0 +1,978 @@ +#!/usr/bin/env -S uv run --quiet +# /// script +# requires-python = ">=3.10" +# dependencies = [] +# /// +"""Apply a low-risk fix proposed by `diagnose.py`, or print the plan. + +This is the ONLY rocm-doctor script that can change the system.
Every +diagnosis from `diagnose.py` carries a stable `fix_id`; pass it here: + + python scripts/apply_fix.py --fix-id fix-4-render-group + python scripts/apply_fix.py --fix-id fix-2-unset-override --dry-run + python scripts/apply_fix.py --list + +`--dry-run` is the default safety hatch: it prints the planned commands +and exits 0 without executing anything. Use it to show the user exactly +what would change. + +When a fix has `auto_applicable=False` (most of the structural fixes: +kernel-module blacklist, repo cleanup, multi-GPU IOMMU, amdgpu-install +rebuild), this script prints the commands and exits 0 without running +them, even without `--dry-run`. The user has to copy-paste, because the +risk of a half-applied state is too high for a tool to take. + +Each recipe carries an `applies_on` set of os_family values. `main` refuses +with exit 3 when the running OS isn't in that set, replacing the per-runner +platform.system() checks. Linux-only recipes (fix-3, -4, -5, -7, -10, -11, +-12) refuse on Windows; Windows-only recipes (fix-13, -14, -15) refuse on +Linux; the rest are cross-platform. + +Exit codes: + 0 = success (or dry-run finished, or fix is advisory-only). + 2 = unknown --fix-id. + 3 = required environment is missing (e.g. fix needs `sudo` and there's no + sudo, or fix doesn't apply to the running OS). + 4 = the underlying command exited non-zero; nothing was rolled back. + 5 = user declined the change at the interactive prompt. + +Design constraints: + - Never run anything `sudo` without printing the command first. + - Never modify the Windows registry, BIOS, or kernel cmdline non-interactively. + - Never restart services or reboot the machine. + - Never reinstall packages without an explicit --yes flag. + - Never silently fall through to an unrelated fix because the requested + one wasn't applicable -- exit 3 and tell the user why. 
def _run(cmd: list[str], timeout: float = 60.0) -> tuple[int, str, str]:
    """Execute *cmd* (no shell) and hand back ``(returncode, stdout, stderr)``.

    Both streams are captured as text. A non-zero exit status is reported
    through the return code rather than raised. If the process cannot be
    started or does not finish (missing executable, timeout, other OS-level
    error) the result is ``(127, "", <exception text>)``.
    """
    try:
        completed = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout,
            check=False,
        )
    except (FileNotFoundError, subprocess.SubprocessError, OSError) as failure:
        return 127, "", str(failure)

    captured_out = completed.stdout if completed.stdout else ""
    captured_err = completed.stderr if completed.stderr else ""
    return completed.returncode, captured_out, captured_err
def _print_recipe(r: FixRecipe) -> None:
    """Write a human-readable summary of recipe *r* to stdout.

    Sections appear in a fixed order: id/title, OS scope, rationale, the
    command list (if any), a comma-separated flag line (if any flag is
    active), every note, and finally the verify command (if set).
    """
    report = [
        f"Fix: {r.fix_id} -- {r.title}",
        f"OS scope: {', '.join(sorted(r.applies_on))}",
        f"Rationale: {r.rationale}",
    ]

    if r.commands:
        report.append("Commands:")
        report.extend(f" $ {command}" for command in r.commands)

    # (condition, label) pairs, kept in the order they are displayed.
    flag_table = (
        (r.needs_sudo, "requires sudo"),
        (r.needs_reboot, "requires reboot"),
        (r.needs_relogin, "requires re-login"),
        (not r.auto_applicable, "manual only (apply_fix.py will NOT run it)"),
    )
    active = [label for enabled, label in flag_table if enabled]
    if active:
        report.append(f"Flags: {', '.join(active)}")

    report.extend(f"Note: {note}" for note in r.notes)

    if r.verify:
        report.append(f"Verify: {r.verify}")

    for line in report:
        print(line)
def run_render_group(args, recipe: FixRecipe) -> int:
    """fix-4: add the invoking user to the render group (and 'video' for safety).

    The target account comes from $USER / $LOGNAME. When the script is
    already running as root through `sudo`, those variables name root, so
    $SUDO_USER (the account that invoked sudo) takes precedence. Without
    that, `sudo apply_fix.py --fix-id fix-4-render-group` would add root to
    the groups while the recipe's own command
    (`sudo usermod -a -G render,video "$USER"`) targets the invoking user.

    Args:
        args: parsed CLI namespace; `dry_run` and `yes` are read.
        recipe: the matching FixRecipe. Unused here; part of the common
            runner signature.

    Returns:
        0 on success or dry-run; 3 when the user name, `usermod`, or `sudo`
        cannot be found; 4 when `usermod` exits non-zero; 5 when the user
        declines the prompt.
    """
    is_root = hasattr(os, "geteuid") and os.geteuid() == 0

    user = os.environ.get("USER") or os.environ.get("LOGNAME") or ""
    if is_root:
        # Under sudo USER/LOGNAME describe root, not the person at the keyboard.
        user = os.environ.get("SUDO_USER") or user
    if not user:
        print("Could not determine current user from $USER/$LOGNAME.")
        return 3
    if not _have("usermod"):
        print("`usermod` not on PATH; cannot add groups.")
        return 3
    if not is_root and not _have("sudo"):
        print("`sudo` is not on PATH and we are not root; cannot add groups.")
        return 3

    cmd = ["usermod", "-a", "-G", "render,video", user]
    if not is_root:
        cmd.insert(0, "sudo")

    # Design constraint: never run anything privileged without printing it first.
    print("Will run:", " ".join(cmd))
    if args.dry_run:
        print("(dry-run; not executed)")
        return 0
    if not _confirm("Add user to render,video groups?", args.yes):
        return 5

    rc, out, err = _run(cmd, timeout=20)
    if out:
        sys.stdout.write(out)
    if err:
        sys.stderr.write(err)
    if rc != 0:
        print(f"usermod exited {rc}; group membership NOT changed.")
        return 4

    print(f"Added {user} to render,video.")
    print(
        "IMPORTANT: log out and back in (or reboot) for the membership to "
        "take effect in new shells and services. `newgrp render` patches "
        "the current shell only."
    )
    return 0
Tell the user which scope still holds the value and how to clear + it (`setx HSA_OVERRIDE_GFX_VERSION ""` or System Properties UI). + """ + if platform.system().lower() == "windows": + return _run_unset_override_windows(args, recipe) + return _run_unset_override_linux(args, recipe) + + +def _run_unset_override_linux(args, recipe: FixRecipe) -> int: + current = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "") + if not current: + print("HSA_OVERRIDE_GFX_VERSION is already unset in this shell.") + else: + print(f"HSA_OVERRIDE_GFX_VERSION={current} is set in this shell.") + print("In your current shell, run:") + print(" unset HSA_OVERRIDE_GFX_VERSION") + print("(This script can't unset it in your parent shell; it only sees a copy.)") + + candidates = [ + Path.home() / ".bashrc", + Path.home() / ".bash_profile", + Path.home() / ".zshrc", + Path.home() / ".profile", + Path.home() / ".config" / "fish" / "config.fish", + ] + rc_hits: list[Path] = [] + for f in candidates: + if not f.exists(): + continue + try: + body = f.read_text(encoding="utf-8", errors="replace") + except OSError: + continue + if re.search(r"HSA_OVERRIDE_GFX_VERSION", body): + rc_hits.append(f) + + if not rc_hits: + print("\nNo persistent HSA_OVERRIDE_GFX_VERSION found in your shell rc files.") + return 0 + + print("\nPersistent HSA_OVERRIDE_GFX_VERSION found in:") + for f in rc_hits: + print(f" - {f}") + print( + "\nRemove or comment those lines manually. apply_fix.py does NOT edit " + "your shell rc files for you; that's your dotfiles. 
def _run_unset_override_windows(args, recipe: FixRecipe) -> int:
    """fix-2 (Windows): report and, with consent, clear the persisted override.

    Steps:
      1. Report whether HSA_OVERRIDE_GFX_VERSION is set in this process.
      2. Query the User and Machine environment scopes through PowerShell.
      3. For a User-scope value, offer to clear it with `setx` (honouring
         --dry-run and --yes).
      4. For a Machine-scope value, only print instructions; this script
         never elevates itself.

    Args:
        args: parsed CLI namespace; `dry_run` and `yes` are read.
        recipe: the matching FixRecipe. Unused here; part of the common
            runner signature.

    Returns:
        0 when nothing persistent is found, on dry-run, or after a
        successful clear; 3 when a scope could not be queried and no value
        was found in the others (so "nothing to do" cannot be claimed);
        4 when `setx` exits non-zero; 5 when the user declines the prompt.
    """
    current = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "")
    if current:
        print(f"HSA_OVERRIDE_GFX_VERSION={current} is set in this shell.")
        print("Note: clearing it in your Windows env scope does NOT affect this")
        print("already-open shell -- close and reopen your terminal afterwards.")
    else:
        print("HSA_OVERRIDE_GFX_VERSION is not set in this shell.")

    found: dict[str, str] = {}
    unreadable: list[str] = []
    for scope in ("User", "Machine"):
        rc, out, _ = _run([
            "powershell", "-NoProfile", "-Command",
            f"[Environment]::GetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION','{scope}')",
        ], timeout=8)
        if rc != 0:
            # A failed query is "unknown", not "unset".
            unreadable.append(scope)
        elif out.strip():
            found[scope] = out.strip()
    user_val = found.get("User", "")
    machine_val = found.get("Machine", "")

    if unreadable:
        print(
            f"\nCould not query the {' / '.join(unreadable)} env scope via "
            "PowerShell; check it by hand in System Properties -> "
            "Environment Variables."
        )

    if not found:
        if unreadable:
            return 3
        print("\nNo persistent HSA_OVERRIDE_GFX_VERSION found in either the User")
        print("or Machine env scope. You're done after closing/reopening shells.")
        return 0

    print("\nPersistent HSA_OVERRIDE_GFX_VERSION found in:")
    if user_val:
        print(f" User scope: {user_val}")
    if machine_val:
        print(f" Machine scope: {machine_val}")

    declined = False
    if user_val:
        print('\nClear from the User scope (no admin needed):')
        print(' Will run: setx HSA_OVERRIDE_GFX_VERSION ""')
        if args.dry_run:
            print(" (dry-run; not executed)")
        elif _confirm("Clear HSA_OVERRIDE_GFX_VERSION from User scope?", args.yes):
            rc, out, err = _run(["setx", "HSA_OVERRIDE_GFX_VERSION", ""], timeout=15)
            if out:
                sys.stdout.write(out)
            if err:
                sys.stderr.write(err)
            if rc != 0:
                print(f"setx exited {rc}; User scope NOT changed.")
                return 4
            print("Cleared from User scope. Reopen your terminal for it to take effect.")
        else:
            # Still print the Machine-scope guidance below, then report the
            # refusal with the documented exit code.
            declined = True

    if machine_val:
        print(
            "\nThe Machine scope value cannot be cleared without an Admin shell. "
            "Either run an elevated PowerShell and execute:"
        )
        print(" [Environment]::SetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION', $null, 'Machine')")
        print(
            "or remove it through System Properties -> Environment Variables -> "
            "System variables. apply_fix.py does NOT elevate itself."
        )
    return 5 if declined else 0
def _run_path_export_windows(args, recipe: FixRecipe) -> int:
    """Append the HIP SDK's bin directory to the User PATH via setx.

    `setx` rewrites the whole variable, so the current User-scope PATH is
    read first, the SDK bin directory is appended if it is not already an
    entry, and the combined value is written back.

    Safety rules enforced here:
      - If the current User PATH cannot be read, nothing is written;
        otherwise setx would replace the user's PATH with a single entry.
      - If the combined value exceeds setx's 1024-character limit, nothing
        is written, because setx would truncate it.

    Args:
        args: parsed CLI namespace; `dry_run` and `yes` are read.
        recipe: the matching FixRecipe. Unused here; part of the common
            runner signature.

    Returns:
        0 on success, dry-run, or when the directory is already on PATH;
        3 when no usable SDK install is found, the User PATH cannot be
        read, or the result would be too long for setx; 4 when `setx`
        exits non-zero; 5 when the user declines the prompt.
    """
    sdk_path = os.environ.get("HIP_PATH", "")
    if not sdk_path:
        version_dir = re.compile(r"\d+(\.\d+)+")
        for root in (r"C:\Program Files\AMD\ROCm", r"C:\Program Files (x86)\AMD\ROCm"):
            versions = []
            try:
                base = Path(root)
                if base.is_dir():
                    for child in base.iterdir():
                        hit = version_dir.match(child.name)
                        if hit and child.is_dir():
                            # Compare numerically so 6.10 outranks 6.2.
                            key = tuple(int(part) for part in hit.group(0).split("."))
                            versions.append((key, child.name, child))
            except OSError:
                continue
            if versions:
                sdk_path = str(max(versions, key=lambda item: item[:2])[2])
                break
    if not sdk_path:
        print("No HIP SDK install found. Run fix-13-hip-sdk-missing first.")
        return 3
    bin_dir = str(Path(sdk_path) / "bin")
    if not Path(bin_dir).is_dir():
        print(f"{bin_dir} does not exist on disk; HIP SDK install looks incomplete.")
        return 3

    rc, out, err = _run([
        "powershell", "-NoProfile", "-Command",
        "[Environment]::GetEnvironmentVariable('PATH','User')",
    ], timeout=8)
    if rc != 0:
        if err:
            sys.stderr.write(err)
        print(
            "Could not read the current User PATH via PowerShell; refusing to "
            "rewrite it blind (setx replaces the whole variable)."
        )
        return 3
    user_path = out.strip()

    def _norm(entry: str) -> str:
        # Windows paths are case-insensitive; ignore a trailing separator.
        return entry.strip().rstrip("\\/").lower()

    entries = {_norm(entry) for entry in user_path.split(";") if entry.strip()}
    if _norm(bin_dir) in entries:
        print(f"User PATH already contains {bin_dir}; no change.")
        return 0
    new_path = f"{user_path.rstrip(';')};{bin_dir}".lstrip(";")

    if len(new_path) > 1024:
        print(
            f"The updated User PATH would be {len(new_path)} characters; setx "
            "truncates values longer than 1024. Add the directory by hand via "
            "System Properties -> Environment Variables instead:"
        )
        print(f" {bin_dir}")
        return 3

    print(f"Plan: append {bin_dir} to your User PATH:")
    print(f" setx PATH \"{new_path}\"")
    if args.dry_run:
        print("(dry-run; not executed)")
        return 0
    if not _confirm("Update User PATH?", args.yes):
        return 5

    rc, out, err = _run(["setx", "PATH", new_path], timeout=15)
    if out:
        sys.stdout.write(out)
    if err:
        sys.stderr.write(err)
    if rc != 0:
        print(f"setx exited {rc}; User PATH NOT changed.")
        return 4
    print(
        f"Added {bin_dir} to your User PATH. setx only takes effect in NEW "
        "shells -- close this terminal and reopen it before re-running hipInfo."
    )
    return 0
def _run_hip_visible_devices_linux(args, recipe: FixRecipe) -> int:
    """fix-9 (Linux): append `export HIP_VISIBLE_DEVICES=<N>` to the shell rc file.

    The index comes from `args.device_index`; when it is None only the
    guided `rocminfo` query is printed. The rc file is ~/.zshrc when $SHELL
    mentions zsh, otherwise ~/.bashrc, falling back to ~/.bashrc when the
    first choice does not exist but ~/.bashrc does. A file that already
    assigns HIP_VISIBLE_DEVICES is left untouched.

    Returns:
        0 on success, dry-run, or when the rc file already sets the
        variable; 3 when no device index was given or the rc file cannot be
        read; 4 when the append fails; 5 when the user declines the prompt.
    """
    device = args.device_index
    if device is None:
        print(
            "Run `rocminfo | grep -E 'Agent |Marketing|gfx'` and identify the row "
            "of your DISCRETE GPU (the iGPU is typically Agent 1). Then re-run "
            "apply_fix.py with --device-index N."
        )
        return 3

    home = Path.home()
    bashrc = home / ".bashrc"
    uses_zsh = "zsh" in os.environ.get("SHELL", "")
    target = home / ".zshrc" if uses_zsh else bashrc
    if not target.exists() and bashrc.exists():
        target = bashrc

    new_line = f"export HIP_VISIBLE_DEVICES={device}"

    if target.exists():
        try:
            contents = target.read_text(encoding="utf-8", errors="replace")
        except OSError as problem:
            print(f"Could not read {target}: {problem}")
            return 3
        if "HIP_VISIBLE_DEVICES=" in contents:
            print(
                f"{target} already sets HIP_VISIBLE_DEVICES; edit by hand "
                "rather than appending a second copy."
            )
            return 0

    print(f"Plan: append the following line to {target}:")
    print(f" {new_line}")
    if args.dry_run:
        print("(dry-run; not executed)")
        return 0
    if not _confirm(f"Append to {target}?", args.yes):
        return 5

    try:
        with target.open("a", encoding="utf-8") as handle:
            handle.write("\n# Added by rocm-doctor (apply_fix.py fix-9-igpu-dgpu)\n")
            handle.write(new_line + "\n")
    except OSError as problem:
        print(f"Failed to write {target}: {problem}")
        return 4

    print(
        f"Appended to {target}. Open a new shell for the change to take effect, "
        "then re-run your workload."
    )
    return 0
+# --------------------------------------------------------------------------- + +LINUX_AND_WINDOWS = frozenset({"linux", "windows"}) +LINUX_ONLY = frozenset({"linux"}) +WINDOWS_ONLY = frozenset({"windows"}) + + +RECIPES: dict[str, FixRecipe] = { + "fix-1-arch": FixRecipe( + fix_id="fix-1-arch", + title="GPU gfx target not in framework arch list", + rationale=( + "Your GPU's gfx target is not in the framework wheel's compiled " + "kernel list. Re-install the framework from an index that includes " + "this gfx, OR rebuild llama.cpp with AMDGPU_TARGETS=." + ), + auto_applicable=False, + applies_on=LINUX_AND_WINDOWS, + commands=[ + "# PyTorch (Linux): switch to the ROCm nightly that ships the gfx115x kernels.", + "pip uninstall -y torch torchvision torchaudio", + "pip install --pre torch torchvision torchaudio \\", + " --index-url https://download.pytorch.org/whl/nightly/rocm6.4", + "# PyTorch (Windows): use TheRock's per-gfx wheels (https://github.com/ROCm/TheRock).", + "# llama.cpp:", + "# cmake -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=", + "# cmake --build build -j", + ], + notes=[ + "TheRock per-gfx wheels are the recommended fallback when the " + "official pytorch index does not yet cover your gfx (and the only " + "first-party option on Windows AMD).", + "HSA_OVERRIDE_GFX_VERSION is NOT the right fix here -- it papers " + "over the mismatch and risks page faults at runtime.", + ], + verify="python -c \"import torch; print(torch.cuda.is_available(), torch.cuda.get_arch_list())\"", + ), + "fix-2-unset-override": FixRecipe( + fix_id="fix-2-unset-override", + title="Unset HSA_OVERRIDE_GFX_VERSION", + rationale=( + "HSA_OVERRIDE_GFX_VERSION is set, but your GPU now has a native " + "wheel. The override hides the real gfx and causes page faults / " + "OUT_OF_REGISTERS at runtime." 
+ ), + auto_applicable=True, + applies_on=LINUX_AND_WINDOWS, + commands=[ + "# Linux:", + "unset HSA_OVERRIDE_GFX_VERSION", + "# Then remove the line from ~/.bashrc / ~/.zshrc / ~/.profile.", + "# Windows:", + 'setx HSA_OVERRIDE_GFX_VERSION ""', + "# Or remove via System Properties -> Environment Variables.", + ], + runner=run_unset_override, + verify="env | grep HSA_OVERRIDE_GFX_VERSION || echo OK_UNSET", + ), + "fix-3-rocm-kernel": FixRecipe( + fix_id="fix-3-rocm-kernel", + title="ROCm/distro/kernel triple unsupported", + rationale=( + "ROCm is installed but your kernel/distro combination is outside " + "the supported matrix. Match the kernel to the matrix before " + "reinstalling, or rerun with --no-dkms and accept the risk." + ), + auto_applicable=False, + applies_on=LINUX_ONLY, + commands=[ + "# Cross-check the live AMD matrix before changing anything:", + "# https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html", + "# Common fix on Ubuntu: install the HWE kernel that matches your ROCm release, then reboot.", + ], + needs_reboot=True, + verify="lsmod | grep amdgpu && rocminfo | head -n 5", + ), + "fix-4-render-group": FixRecipe( + fix_id="fix-4-render-group", + title="Add user to render/video groups", + rationale=( + "The current user can't open /dev/kfd because they aren't in the " + "render group. Adding the user is the safe, standard fix." + ), + auto_applicable=True, + applies_on=LINUX_ONLY, + commands=['sudo usermod -a -G render,video "$USER"'], + needs_sudo=True, + needs_relogin=True, + runner=run_render_group, + verify="groups | tr ' ' '\\n' | grep -E '^(render|video)$' && rocminfo | head -n 5", + ), + "fix-5-amdgpu-load": FixRecipe( + fix_id="fix-5-amdgpu-load", + title="Load amdgpu (and clear any blacklist)", + rationale=( + "The amdgpu kernel module is not loaded. Check /etc/modprobe.d " + "for a blacklist entry, regenerate the initramfs, and modprobe." 
+ ), + auto_applicable=False, + applies_on=LINUX_ONLY, + commands=[ + "grep -RIl 'blacklist amdgpu' /etc/modprobe.d /usr/lib/modprobe.d 2>/dev/null || true", + "sudo $EDITOR # remove the blacklist line", + "sudo update-initramfs -u # Debian/Ubuntu", + "sudo dracut -f # Fedora/RHEL", + "sudo modprobe amdgpu", + ], + needs_sudo=True, + needs_reboot=True, + verify="lsmod | grep amdgpu && rocminfo | head -n 5", + notes=[ + "If Secure Boot is enabled and amdgpu still won't load, the DKMS " + "module isn't signed. Either sign it with mokutil or disable " + "Secure Boot in firmware.", + ], + ), + "fix-6-path": FixRecipe( + fix_id="fix-6-path", + title="Add the ROCm/HIP bin directory to PATH", + rationale=( + "Linux: ROCm is installed at /opt/rocm but its bin directory isn't " + "on PATH, so `rocminfo` / `hipcc` aren't visible to the shell. " + "Windows: the HIP SDK is installed but its bin directory isn't on " + "the User PATH, so `hipInfo.exe` and the runtime DLLs can't be found." + ), + auto_applicable=True, + applies_on=LINUX_AND_WINDOWS, + commands=[ + "# Linux:", + 'echo \'export PATH="/opt/rocm/bin:$PATH"\' >> ~/.bashrc', + "# Windows:", + 'setx PATH "%PATH%;C:\\Program Files\\AMD\\ROCm\\\\bin"', + ], + runner=run_path_export, + verify="rocminfo | head -n 5 && hipcc --version", + ), + "fix-7-stale-repos": FixRecipe( + fix_id="fix-7-stale-repos", + title="Quarantine duplicate AMD repos", + rationale=( + "More than one ROCm/AMDGPU repo file exists. The package manager " + "is mixing versions; quarantine the extras before reinstalling." 
+ ), + auto_applicable=False, + applies_on=LINUX_ONLY, + commands=[ + "ls /etc/apt/sources.list.d/ | grep -iE 'rocm|amdgpu|radeon'", + "# For each duplicate file:", + "sudo mv /etc/apt/sources.list.d/.list /etc/apt/sources.list.d/.list.bak", + "sudo apt update", + ], + needs_sudo=True, + verify="sudo apt update 2>&1 | tail -n 20", + ), + "fix-8-wheel-rocm": FixRecipe( + fix_id="fix-8-wheel-rocm", + title="Reinstall the framework against the system ROCm/HIP major", + rationale=( + "The framework's bundled HIP version doesn't match the system " + "ROCm (Linux) or HIP SDK (Windows). libamdhip64.so.X / " + "amdhip64_X.dll load failures are the usual signal." + ), + auto_applicable=False, + applies_on=LINUX_AND_WINDOWS, + commands=[ + "pip uninstall -y torch torchvision torchaudio", + "# Linux: pick the index that matches your system ROCm major:", + "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4", + "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3", + "# Windows: use TheRock's wheels matching your HIP SDK major:", + "# https://github.com/ROCm/TheRock", + ], + verify="python -c \"import torch; print(torch.__version__, torch.version.hip, torch.cuda.is_available())\"", + ), + "fix-9-igpu-dgpu": FixRecipe( + fix_id="fix-9-igpu-dgpu", + title="Hide the iGPU with HIP_VISIBLE_DEVICES", + rationale=( + "Both an APU iGPU and a discrete AMD GPU are visible. Pin the " + "runtime to the dGPU so the iGPU doesn't destabilise it." 
+ ), + auto_applicable=True, + applies_on=LINUX_AND_WINDOWS, + commands=[ + "# Linux:", + "rocminfo | grep -E 'Agent |Marketing|gfx' # find the dGPU index", + "export HIP_VISIBLE_DEVICES=", + "# Windows:", + '& "$env:HIP_PATH\\bin\\hipInfo.exe" | Select-String "device#|Name"', + "setx HIP_VISIBLE_DEVICES ", + ], + runner=run_hip_visible_devices, + verify="python -c \"import torch; print(torch.cuda.device_count(), torch.cuda.get_device_name(0))\"", + notes=[ + "Pass --device-index N to persist the env var; without it, " + "this fix only prints the rocminfo / hipInfo query so you can identify N.", + ], + ), + "fix-10-container": FixRecipe( + fix_id="fix-10-container", + title="Re-launch the container with AMD devices passed through", + rationale=( + "The container can't see /dev/kfd or /dev/dri/renderD*. Pass the " + "devices and the host's render group via the runtime flags." + ), + auto_applicable=False, + applies_on=LINUX_ONLY, + commands=[ + "docker run --rm -it \\", + " --device=/dev/kfd \\", + " --device=/dev/dri \\", + " --group-add render \\", + " --security-opt seccomp=unconfined \\", + " --shm-size=8g \\", + " rocm/pytorch:latest", + ], + verify="rocminfo | head -n 5", + notes=[ + "Rootless podman additionally needs `--userns=keep-id` and a " + "host user that is in the render group; podman maps it through.", + ], + ), + "fix-11-iommu": FixRecipe( + fix_id="fix-11-iommu", + title="Add iommu=pt to the kernel command line", + rationale=( + "Multi-GPU jobs hang when the IOMMU is in the default 'on' mode " + "with translation; pass-through mode fixes the hang. This requires " + "editing GRUB and rebooting; we will not do that for you." 
+ ), + auto_applicable=False, + applies_on=LINUX_ONLY, + commands=[ + "cat /proc/cmdline", + "sudo $EDITOR /etc/default/grub # add iommu=pt to GRUB_CMDLINE_LINUX_DEFAULT", + "sudo update-grub # Debian/Ubuntu", + "sudo grub2-mkconfig -o /boot/grub2/grub.cfg # Fedora/RHEL", + "# Reboot, then retry the multi-GPU workload.", + ], + needs_sudo=True, + needs_reboot=True, + verify="cat /proc/cmdline | grep -o 'iommu=\\w*'", + ), + "fix-12-installer": FixRecipe( + fix_id="fix-12-installer", + title="Reset amdgpu-install state and reinstall", + rationale=( + "amdgpu-install left a half-configured DKMS / repo state. Run " + "the documented uninstall, clean up, and reinstall without the " + "flag that broke things (commonly --accept-eula on newer installers)." + ), + auto_applicable=False, + applies_on=LINUX_ONLY, + commands=[ + "sudo amdgpu-install --uninstall", + "sudo apt autoremove --purge -y", + "sudo apt update", + "sudo amdgpu-install --usecase=rocm,hip", + ], + needs_sudo=True, + needs_reboot=True, + verify="dpkg -l | grep -E 'rocm|amdgpu' | head -n 20 && rocminfo | head -n 5", + notes=[ + "If `apt autoremove --purge` warns it will remove unrelated " + "packages, stop and resolve those by hand before continuing.", + ], + ), + "fix-13-hip-sdk-missing": FixRecipe( + fix_id="fix-13-hip-sdk-missing", + title="Install the AMD HIP SDK for Windows", + rationale=( + "Your framework links against HIP but the HIP SDK isn't installed " + "on this host. The runtime DLLs (amdhip64_X.dll, hipblas.dll, " + "hsa-runtime64.dll) and hipInfo.exe ship inside the SDK installer." 
+ ), + auto_applicable=False, + applies_on=WINDOWS_ONLY, + commands=[ + "# Download and install the HIP SDK (matched to your framework's HIP major):", + "# https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html", + "# After install, reopen the shell so HIP_PATH and PATH pick up the new install.", + ], + verify=( + 'powershell -NoProfile -Command ' + '"& \\"$env:HIP_PATH\\bin\\hipInfo.exe\\" | Select-Object -First 5"' + ), + notes=[ + "If you only need PyTorch on Windows AMD and don't need the C/C++ " + "HIP toolchain, the TheRock wheels bundle their own HIP runtime " + "and may not require a system HIP SDK install.", + ], + ), + "fix-14-adrenalin-too-old": FixRecipe( + fix_id="fix-14-adrenalin-too-old", + title="Update the Adrenalin / kernel-mode driver", + rationale=( + "The HIP SDK is installed but the AMD kernel-mode driver " + "(Adrenalin / Adrenalin Pro) is older than the SDK release notes " + "call out. The user-space SDK and the driver have to match." + ), + auto_applicable=False, + applies_on=WINDOWS_ONLY, + commands=[ + "# Cross-check the HIP SDK release notes for the exact driver pairing:", + "# https://rocm.docs.amd.com/projects/install-on-windows/en/latest/install/install.html", + "# Then download the matching driver from:", + "# https://www.amd.com/en/support", + "# Reboot after the install for the kernel-mode driver to take effect.", + ], + needs_reboot=True, + verify=( + 'powershell -NoProfile -Command ' + '"(Get-CimInstance Win32_VideoController | ' + "Where-Object { $_.Name -like '*AMD*' -or $_.Name -like '*Radeon*' } | " + 'Select-Object -First 1).DriverVersion"' + ), + ), + "fix-15-msvc-redist": FixRecipe( + fix_id="fix-15-msvc-redist", + title="Install the MSVC 2015-2022 runtime redistributable", + rationale=( + "The HIP SDK's amdhip64_X.dll links against the MSVC 2015-2022 " + "runtime. 
def _list_recipes() -> None:
    """Print one summary row per entry in ``RECIPES``.

    Each row shows whether the recipe is auto-applicable or print-only,
    the OS families it applies on, its fix-id and its title.
    """
    print("Available fix-ids (mirror diagnose.py):")
    for recipe in RECIPES.values():
        if recipe.auto_applicable:
            kind = "AUTO"
        else:
            kind = "PRINT-ONLY"
        scope = "/".join(sorted(recipe.applies_on))
        print(f" [{kind:>10s}] [{scope:>14s}] {recipe.fix_id} -- {recipe.title}")


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: look up a recipe and print or run it.

    Args:
        argv: Argument vector to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 after ``--list``, ``--json``, or a print-only recipe; 2 for an
        unknown fix-id; 3 when the running OS is not in the recipe's
        ``applies_on``; 4 when an auto-applicable recipe has no runner;
        otherwise whatever the recipe's runner returns. A missing
        ``--fix-id`` (without ``--list``) goes through ``parser.error``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--fix-id",
        help="Stable fix identifier from diagnose.py (e.g. fix-4-render-group).",
    )
    # Registration order is kept so the generated --help lists options in
    # the same order as before.
    leading_switches = (
        ("--list", "List every fix-id and exit."),
        ("--dry-run", "Show the plan without changing anything."),
        (
            "--yes",
            "Skip the interactive confirmation. Use only when the user has "
            "already approved the plan in chat.",
        ),
    )
    for flag, help_text in leading_switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    parser.add_argument(
        "--device-index", type=int, default=None,
        help="For fix-9-igpu-dgpu: the rocminfo Agent index of the discrete GPU.",
    )
    parser.add_argument(
        "--json", action="store_true",
        help="Emit the recipe as JSON instead of running it.",
    )
    args = parser.parse_args(argv)

    if args.list:
        _list_recipes()
        return 0

    if not args.fix_id:
        parser.error("--fix-id or --list is required")

    recipe = RECIPES.get(args.fix_id)
    if recipe is None:
        print(f"Unknown fix-id: {args.fix_id}", file=sys.stderr)
        print("Run `python scripts/apply_fix.py --list` for the full list.", file=sys.stderr)
        return 2

    if args.json:
        # The runner callable is not JSON-serialisable, so it is left out;
        # `applies_on` is emitted as a sorted list to keep output stable.
        payload = {
            key: (sorted(value) if key == "applies_on" else value)
            for key, value in vars(recipe).items()
            if key != "runner"
        }
        print(json.dumps(payload, indent=2))
        return 0

    _print_recipe(recipe)
    print()

    current_os = platform.system().lower()
    if current_os not in recipe.applies_on:
        supported = ", ".join(sorted(recipe.applies_on))
        print(f"This fix only applies on: {supported}. Running OS is: {current_os}.")
        return 3

    if recipe.auto_applicable:
        if recipe.runner is None:
            # Defensive: an auto_applicable recipe with no runner is a bug.
            print("Internal error: auto-applicable recipe has no runner.", file=sys.stderr)
            return 4
        return recipe.runner(args, recipe)  # type: ignore[misc]

    print("This fix is print-only (manual change required).")
    print("Copy the commands above, run them yourself, then verify with:")
    if recipe.verify:
        print(f" $ {recipe.verify}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@dataclass
class Fix:
    """The next action attached to a `Diagnosis`, plus how to verify it."""

    summary: str                                  # one-line plan
    # Command lines shown to the user; entries starting with "#" are
    # explanatory comments rather than something to execute.
    commands: list[str] = field(default_factory=list)
    needs_sudo: bool = False                      # True when the fix needs root privileges
    needs_reboot: bool = False                    # True when a reboot is needed for the fix to take effect
    needs_relogin: bool = False                   # True when the user must log out and back in
    fix_id: str = ""                              # passed to apply_fix.py --fix-id
    auto_applicable: bool = False                 # True iff apply_fix.py can run it
    notes: list[str] = field(default_factory=list)  # extra caveats shown alongside the fix
    verify: str = ""                              # command the agent should run after


@dataclass
class Diagnosis:
    """One scored match against the closed failure-mode catalog."""

    id: str                                       # stable identifier, e.g. "fix-4-render-group"
    title: str                                    # one-line description of the failure mode
    score: int                                    # 0..100 confidence; 0 means "not a match"
    evidence: list[str] = field(default_factory=list)  # concrete facts behind the score
    fix: Fix | None = None                        # None on the score-0 "no match" results
found", 20, "vague 'no devices found'"), + (r"hsa_status_error", 10, "HSA error (broad)"), +] + +KEYWORDS_PATH_MISSING = [ + (r"rocminfo: command not found", 50, "rocminfo not on PATH"), + (r"command not found.*hipcc", 40, "hipcc not on PATH"), + (r"/opt/rocm/bin", 15, "user mentions /opt/rocm/bin"), +] + +KEYWORDS_LIB_MISMATCH = [ + (r"libamdhip64\.so", 50, "error mentions libamdhip64.so"), + (r"libhsa-runtime", 45, "error mentions libhsa-runtime"), + (r"libhipblas", 40, "error mentions libhipblas"), + (r"amdhip64_\d+\.dll", 50, "error mentions amdhip64_X.dll (Windows)"), + (r"hipblas\.dll", 40, "error mentions hipblas.dll (Windows)"), + (r"cannot open shared object file", 25, "ldopen failure"), + (r"dll load failed", 25, "Windows DLL load failure"), + (r"version `?glibc", 5, "tangential glibc version error"), +] + +KEYWORDS_HIP_SDK_MISSING = [ + (r"amdhip64.*not found", 50, "error names amdhip64 missing"), + (r"could not find hip", 40, "error mentions HIP not found"), + (r"hip_path.*not set", 35, "user mentions HIP_PATH unset"), + (r"hipinfo.*not recognized", 45, "Windows says hipInfo is not a command"), +] + +KEYWORDS_MSVC_REDIST = [ + (r"vcruntime140(?:_1)?\.dll", 50, "error mentions vcruntime140 / vcruntime140_1"), + (r"api-ms-win-crt-.*\.dll", 35, "error mentions api-ms-win-crt-* DLL"), + (r"the (program|application) can't start because", 25, "Windows missing-DLL dialog text"), + (r"msvcp140\.dll", 30, "error mentions msvcp140.dll"), +] + +KEYWORDS_REPO_BROKEN = [ + (r"404.*repo\.radeon\.com", 50, "404 against repo.radeon.com"), + (r"release file (is )?not (yet )?valid", 30, "apt 'release file not valid'"), + (r"the following packages have unmet dependencies", 25, "apt unmet dependencies"), + (r"unable to locate package rocm", 35, "apt cannot find ROCm package"), +] + +KEYWORDS_CONTAINER = [ + (r"hsa_status_error.*permission", 20, "HSA permission error (often container)"), + (r"/dev/dri.*permission", 30, "/dev/dri permission failure"), + (r"failed to open 
def _keyword_score(symptom: str, table: list[tuple[str, int, str]]) -> tuple[int, list[str]]:
    """Score `symptom` against a keyword table, counting only the two best hits.

    Every ``(regex, weight, label)`` row whose regex is found in the
    lower-cased symptom is a hit. Summing all hits would double-count an
    error message that describes one problem in several ways, so only
    the two highest-weighted hits contribute.

    Returns:
        ``(score, evidence_labels)``; ``(0, [])`` when `symptom` is empty
        or nothing matches.
    """
    if not symptom:
        return 0, []
    lowered = symptom.lower()
    matched = sorted(
        (
            (weight, label)
            for pattern, weight, label in table
            if re.search(pattern, lowered)
        ),
        reverse=True,
    )
    strongest = matched[:2]
    total = sum(weight for weight, _ in strongest)
    labels = [label for _, label in strongest]
    return total, labels
def _g(exam: dict, *path: str, default: Any = None) -> Any:
    """Safe nested-key getter.

    Walks `path` through nested dicts starting at `exam`.

    Returns:
        The value found, or `default` when a key is missing, an
        intermediate value is not a dict, or the final value is None.
    """
    cur: Any = exam
    for p in path:
        if not isinstance(cur, dict) or p not in cur:
            return default
        cur = cur[p]
    return cur if cur is not None else default


def _amd_gpus(exam: dict) -> list[dict]:
    """Return the dict entries of ``exam["gpus"]`` whose ``is_amd`` is truthy.

    A missing, null, or non-list ``gpus`` value yields an empty list
    instead of raising, so a hand-built exam with a malformed field does
    not crash every checker that calls this.
    """
    gpus = _g(exam, "gpus", default=[])
    if not isinstance(gpus, (list, tuple)):
        return []
    return [g for g in gpus if isinstance(g, dict) and g.get("is_amd")]


def _amd_gfx_targets(exam: dict) -> list[str]:
    """Return the non-empty ``gfx_target`` values of the AMD GPUs, in order."""
    return [g["gfx_target"] for g in _amd_gpus(exam) if g.get("gfx_target")]
def check_2_hsa_override_unneeded(exam: dict, symptom: str) -> Diagnosis:
    """Flag an HSA_OVERRIDE_GFX_VERSION that is set in the examined environment.

    Returns a score-0 Diagnosis when the variable is absent or empty.
    Otherwise the score starts at 30 and grows with: page-fault keyword
    hits in `symptom`, a "page fault" line in the amdgpu dmesg tail
    (+20), and every detected AMD gfx target already being present in
    the framework arch list (+25). The score is capped at 100. The
    attached fix is PowerShell-flavoured when ``os_family`` is
    "windows" and POSIX-shell-flavoured otherwise.
    """
    fix_id = "fix-2-unset-override"
    environment = _g(exam, "env", default={}) or {}
    override_value = environment.get("HSA_OVERRIDE_GFX_VERSION", "")
    if not override_value:
        return Diagnosis(id=fix_id, title="HSA_OVERRIDE_GFX_VERSION set unnecessarily", score=0)

    evidence = [f"HSA_OVERRIDE_GFX_VERSION={override_value} is set in the current shell"]

    # Page faults are the classic late-binding symptom of an override
    # that masks the real gfx.
    keyword_points, keyword_evidence = _keyword_score(symptom, KEYWORDS_PAGE_FAULT)
    score = 30 + keyword_points
    evidence.extend(keyword_evidence)

    for line in _g(exam, "dmesg_amdgpu_tail", default=[]) or []:
        if "page fault" in line.lower():
            score += 20
            evidence.append("kernel ring shows amdgpu page faults")
            break

    arch_list = _g(exam, "framework_arch_list", default=[]) or []
    targets = _amd_gfx_targets(exam)
    if arch_list and targets and not [t for t in targets if t not in arch_list]:
        score += 25
        evidence.append(
            f"every detected GPU target ({targets}) is in the framework arch "
            f"list ({arch_list}); the override is hiding the native gfx."
        )

    if _g(exam, "os_family", default="linux") == "windows":
        summary = "Clear HSA_OVERRIDE_GFX_VERSION (Windows) and use the native HIP SDK / wheel."
        commands = [
            "# Inspect the User and Machine env scopes:",
            "[Environment]::GetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION','User')",
            "[Environment]::GetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION','Machine')",
            "# Clear from the User scope (does NOT affect already-open shells):",
            'setx HSA_OVERRIDE_GFX_VERSION ""',
            "# Or remove via System Properties -> Environment Variables.",
        ]
        verify = (
            "powershell -NoProfile -Command "
            "\"[Environment]::GetEnvironmentVariable('HSA_OVERRIDE_GFX_VERSION','User')\""
        )
    else:
        summary = "Unset HSA_OVERRIDE_GFX_VERSION and use the native wheel."
        commands = [
            "unset HSA_OVERRIDE_GFX_VERSION",
            "# Also remove it from ~/.bashrc / ~/.zshrc / ~/.profile if persisted.",
        ]
        verify = (
            "env | grep HSA_OVERRIDE_GFX_VERSION || echo OK_UNSET; "
            "python -c \"import torch; print(torch.cuda.is_available())\""
        )

    return Diagnosis(
        id=fix_id,
        title="HSA_OVERRIDE_GFX_VERSION set on a GPU that has a native wheel",
        score=min(score, 100),
        evidence=evidence,
        fix=Fix(
            summary=summary,
            commands=commands,
            fix_id=fix_id,
            auto_applicable=True,
            verify=verify,
        ),
    )
def _gpu_access_groups(owner_group: Any) -> tuple[list[str], bool]:
    """Choose the groups to join from the group that owns /dev/kfd.

    Only 'render' and 'video' are ever returned, so the value read from
    the exam never reaches a shell command verbatim.

    Returns:
        ``(groups, rejected)``. ``rejected`` is True when `owner_group`
        was set to something other than 'render' or 'video' and was
        therefore ignored in favour of the default.
    """
    if owner_group == "video":
        return ["video"], False
    rejected = bool(owner_group) and owner_group != "render"
    return ["render", "video"], rejected


def check_4_render_group(exam: dict, symptom: str) -> Diagnosis:
    """User not in render/video groups, or /dev/kfd group is wrong.

    Score contributions: not in 'render' (+35), not in 'video' (+10),
    /dev/kfd exists but is not writable by the user (+25), plus keyword
    hits from KEYWORDS_KFD_PERMISSION. A non-positive total returns a
    score-0 Diagnosis; otherwise the score is capped at 100 and an
    auto-applicable `usermod` fix is attached.
    """
    score = 0
    evidence: list[str] = []

    in_render = _g(exam, "in_render_group", default=None)
    in_video = _g(exam, "in_video_group", default=None)
    kfd = _g(exam, "kfd", default=None)
    if not isinstance(kfd, dict):
        # Tolerate a null or malformed `kfd` entry in a hand-built exam.
        kfd = {}
    if in_render is False:
        score += 35
        evidence.append("user is NOT in the 'render' group")
    if in_video is False:
        score += 10
        evidence.append("user is NOT in the 'video' group")
    if kfd.get("exists") is True and kfd.get("user_can_write") is False:
        score += 25
        evidence.append(
            f"/dev/kfd exists (mode {kfd.get('mode')}, group {kfd.get('owner_group')}) "
            "but the current user can't write to it"
        )
    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_KFD_PERMISSION)
    score += kw_score
    evidence += kw_ev

    if score <= 0:
        return Diagnosis(id="fix-4-render-group", title="User missing render/video group", score=0)

    # This fix is auto-applicable and runs under sudo, so never suggest
    # joining an arbitrary owning group (e.g. 'root') taken from the exam.
    groups, owner_rejected = _gpu_access_groups(kfd.get("owner_group"))
    kfd_group = groups[0]
    group_arg = ",".join(groups)
    if len(groups) > 1:
        summary = f"Add the current user to '{kfd_group}' (and 'video' for safety) and log out/in."
    else:
        summary = f"Add the current user to '{kfd_group}' and log out/in."

    notes = [
        "Group membership only takes effect after a full re-login (or "
        "reboot). `newgrp render` will give the current shell access "
        "but not other terminals or services.",
    ]
    if owner_rejected:
        notes.append(
            f"/dev/kfd is owned by group {kfd.get('owner_group')!r}, not 'render' or "
            "'video'. Joining that group is not a safe fix; check the udev rules "
            "that set the /dev/kfd group instead."
        )

    fix = Fix(
        summary=summary,
        commands=[
            f"sudo usermod -a -G {group_arg} \"$USER\"",
        ],
        needs_sudo=True,
        needs_relogin=True,
        fix_id="fix-4-render-group",
        auto_applicable=True,
        verify="groups | tr ' ' '\\n' | grep -E '^(render|video)$' && ls -l /dev/kfd && rocminfo | head -n 5",
        notes=notes,
    )
    return Diagnosis(
        id="fix-4-render-group",
        title="User not in render/video group (or /dev/kfd owned by the other group)",
        score=min(score, 100),
        evidence=evidence,
        fix=fix,
    )
def _path_lists_dir(env_path: str, directory: str, windows: bool) -> bool:
    """Return True when `directory` is one of the entries of `env_path`.

    Entries are compared whole rather than by substring, after trimming
    whitespace, surrounding double quotes and trailing separators. With
    `windows` set, the list is split on ';' and compared
    case-insensitively with '/' treated as a backslash; otherwise it is
    split on ':' and compared exactly.
    """
    separator = ";" if windows else ":"

    def _normalise(entry: str) -> str:
        entry = entry.strip().strip('"')
        if windows:
            return entry.replace("/", "\\").lower().rstrip("\\")
        return entry.rstrip("/")

    wanted = _normalise(directory)
    return any(
        _normalise(entry) == wanted
        for entry in env_path.split(separator)
        if entry.strip()
    )


def check_6_path_missing(exam: dict, symptom: str) -> Diagnosis:
    """ROCm/HIP binaries not on PATH after install.

    Score contributions: the install root is known but its probe binary
    (hipInfo.exe on Windows, rocminfo elsewhere) was not found (+50),
    the root's bin directory is not an entry of the recorded PATH
    (+20), plus keyword hits from KEYWORDS_PATH_MISSING. A non-positive
    total returns a score-0 Diagnosis; otherwise the score is capped at
    100 and an auto-applicable PATH fix is attached.
    """
    score = 0
    evidence: list[str] = []

    os_family = _g(exam, "os_family", default="linux")
    on_windows = os_family == "windows"

    env = _g(exam, "env", default={})
    if not isinstance(env, dict):
        env = {}
    # Windows conventionally spells the variable "Path"; accept both in
    # case the exam kept the original casing.
    env_path = env.get("PATH") or (env.get("Path") if on_windows else None) or ""
    if not isinstance(env_path, str):
        env_path = ""

    if on_windows:
        sdk_path = _g(exam, "hip_sdk_path", default="")
        hipinfo_present = _g(exam, "hipinfo_present", default=None)
        # A trailing separator on the SDK root would otherwise produce
        # "...\\bin" with a doubled backslash that never matches PATH.
        sdk_root = str(sdk_path).rstrip("\\/") if sdk_path else ""
        bin_dir = f"{sdk_root}\\bin" if sdk_root else r"C:\Program Files\AMD\ROCm\bin"
        if sdk_path and hipinfo_present is False:
            score += 50
            evidence.append(f"{sdk_path} exists but hipInfo.exe wasn't found in its bin directory")
        if sdk_path and env_path and not _path_lists_dir(env_path, bin_dir, True):
            score += 20
            evidence.append(f"{bin_dir} is not in PATH")
    else:
        rocm_path = _g(exam, "rocm_path", default="")
        rocminfo_present = _g(exam, "rocminfo_present", default=None)
        rocm_root = str(rocm_path).rstrip("/") if rocm_path else ""
        bin_dir = f"{rocm_root}/bin" if rocm_root else "/opt/rocm/bin"
        if rocm_path and rocminfo_present is False:
            score += 50
            evidence.append(f"{rocm_path} exists but `rocminfo` is not on PATH")
        if rocm_path and env_path and not _path_lists_dir(env_path, bin_dir, False):
            score += 20
            evidence.append(f"{bin_dir} is not in $PATH")

    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_PATH_MISSING)
    score += kw_score
    evidence += kw_ev

    if score <= 0:
        return Diagnosis(id="fix-6-path", title="ROCm not on PATH", score=0)

    if on_windows:
        fix = Fix(
            summary=f"Add {bin_dir} to your User PATH and reopen the shell.",
            commands=[
                f'setx PATH "%PATH%;{bin_dir}"',
                "# Or: System Properties -> Environment Variables -> Path -> Edit -> New.",
                "# `setx` only affects NEW shells; close and reopen this terminal afterwards.",
            ],
            fix_id="fix-6-path",
            auto_applicable=True,
            verify=f'powershell -NoProfile -Command "& \\"{bin_dir}\\hipInfo.exe\\" | Select-Object -First 5"',
            notes=[
                "`setx` stores at most 1024 characters, and `%PATH%` (expanded by "
                "cmd.exe only) is the combined Machine+User PATH. On a long PATH "
                "this command can truncate or duplicate entries; use the "
                "Environment Variables dialog instead in that case.",
            ],
        )
    else:
        fix = Fix(
            summary=f"Add {bin_dir} to PATH for this shell and persist in your shell rc.",
            commands=[
                f"export PATH={bin_dir}:$PATH",
                f"echo 'export PATH={bin_dir}:$PATH' >> ~/.bashrc # or ~/.zshrc",
            ],
            fix_id="fix-6-path",
            auto_applicable=True,
            verify="rocminfo | head -n 5 && hipcc --version",
        )
    return Diagnosis(
        id="fix-6-path",
        title="ROCm/HIP binaries not on PATH after install",
        score=min(score, 100),
        evidence=evidence,
        fix=fix,
    )
def check_8_wheel_rocm_mismatch(exam: dict, symptom: str) -> Diagnosis:
    """Framework wheel built for a different ROCm release than the system.

    The framework's ROCm version is compared with the system runtime
    version (``hip_sdk_version`` on Windows, ``rocm_version`` otherwise)
    at "major.minor" granularity; a difference adds 50. Keyword hits
    from KEYWORDS_LIB_MISMATCH are added on top. A non-positive total
    returns a score-0 Diagnosis; otherwise the score is capped at 100
    and a print-only reinstall plan is attached.
    """
    fix_id = "fix-8-wheel-rocm"
    os_family = _g(exam, "os_family", default="linux")
    on_windows = os_family == "windows"

    def _release(text: str) -> str | None:
        # First "<digits>.<digits>" pair in the string, or None.
        found = re.search(r"(\d+)\.(\d+)", text)
        if not found:
            return None
        return f"{found.group(1)}.{found.group(2)}"

    wheel_release = _release(_g(exam, "framework_rocm_version", default=""))
    system_key = "hip_sdk_version" if on_windows else "rocm_version"
    system_release = _release(_g(exam, system_key, default=""))

    score = 0
    evidence: list[str] = []
    if wheel_release and system_release and wheel_release != system_release:
        score += 50
        runtime = "HIP SDK" if on_windows else "ROCm"
        evidence.append(
            f"Framework links HIP {wheel_release} but system {runtime} is {system_release}"
        )

    keyword_points, keyword_evidence = _keyword_score(symptom, KEYWORDS_LIB_MISMATCH)
    score += keyword_points
    evidence.extend(keyword_evidence)

    if score <= 0:
        return Diagnosis(id=fix_id, title="Wheel/ROCm mismatch", score=0)

    recheck = "python -c \"import torch; print(torch.__version__, torch.version.hip)\""
    if on_windows:
        summary = (
            "Reinstall the framework against the HIP SDK major you have "
            "installed (or install the HIP SDK major the wheel needs)."
        )
        commands = [
            "pip uninstall -y torch torchvision torchaudio",
            "# TheRock publishes Windows ROCm wheels per HIP SDK release:",
            "# https://github.com/ROCm/TheRock",
            "# Match the wheel index to the HIP SDK major you have on disk.",
            recheck,
        ]
    else:
        summary = (
            "Reinstall the framework from the wheel index that matches the "
            "system ROCm major (or upgrade the system ROCm to match the wheel)."
        )
        commands = [
            "pip uninstall -y torch torchvision torchaudio",
            "# Pick the index that matches your system ROCm major. Examples:",
            "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4",
            "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3",
            "# Then re-check:",
            recheck,
        ]

    return Diagnosis(
        id=fix_id,
        title="Framework wheel built for a different ROCm major than the system",
        score=min(score, 100),
        evidence=evidence,
        fix=Fix(
            summary=summary,
            commands=commands,
            fix_id=fix_id,
            auto_applicable=False,
            verify="python -c \"import torch; print(torch.cuda.is_available(), torch.version.hip)\"",
        ),
    )
+ ), + commands=[ + "# Confirm which index is the dGPU (hipInfo.exe output order):", + '& "$env:HIP_PATH\\bin\\hipInfo.exe" | Select-String "device#|Name|gcnArchName"', + "# Then persist HIP_VISIBLE_DEVICES in the User environment:", + "setx HIP_VISIBLE_DEVICES 1", + "# `setx` only takes effect in NEW shells; reopen the terminal.", + ], + fix_id="fix-9-igpu-dgpu", + auto_applicable=True, + verify=( + 'powershell -NoProfile -Command "$env:HIP_VISIBLE_DEVICES=1; ' + 'python -c \\"import torch; print(torch.cuda.device_count())\\""' + ), + notes=[ + f"Detected gfx targets: {gfx_targets}. The dGPU is usually the higher-numbered family (gfx11xx).", + ], + ) + else: + fix = Fix( + summary=( + "Pin the runtime to the discrete GPU with HIP_VISIBLE_DEVICES " + "so the iGPU is hidden." + ), + commands=[ + "# Confirm which index is the dGPU (`rocminfo` output order):", + "rocminfo | grep -E 'Agent |gfx|Marketing'", + "# Then pin HIP to the dGPU (typically index 1 when an APU is index 0):", + "export HIP_VISIBLE_DEVICES=1", + "# Persist in your shell rc or your launch script.", + ], + fix_id="fix-9-igpu-dgpu", + auto_applicable=False, + verify="HIP_VISIBLE_DEVICES=1 python -c \"import torch; print(torch.cuda.device_count())\"", + notes=[ + f"Detected gfx targets: {gfx_targets}. 
The dGPU is usually the higher-numbered family (gfx11xx).", + ], + ) + return Diagnosis( + id="fix-9-igpu-dgpu", + title="iGPU enumerated alongside dGPU and destabilising the runtime", + score=min(score, 100), + evidence=evidence, + fix=fix, + ) + + +def check_10_container_devices(exam: dict, symptom: str) -> Diagnosis: + """Container can't see /dev/kfd or /dev/dri/renderD*.""" + in_container = _g(exam, "in_container", default=False) + if not in_container: + return Diagnosis(id="fix-10-container", title="Container missing devices", score=0) + + score = 25 + evidence = [f"running inside a {_g(exam, 'container_kind', default='container')}"] + kfd = _g(exam, "kfd", default=None) or {} + if kfd.get("exists") is False: + score += 40 + evidence.append("/dev/kfd is not present in the container") + elif kfd.get("user_can_write") is False: + score += 30 + evidence.append("/dev/kfd is present but not writable by the container user") + if not _g(exam, "render_devices", default=[]): + score += 20 + evidence.append("no /dev/dri/renderD* visible in the container") + kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_CONTAINER) + score += kw_score + evidence += kw_ev + + fix = Fix( + summary=( + "Re-launch the container with the AMD devices and the render group " + "passed through." + ), + commands=[ + "# Docker / Podman flags AMD-recommends:", + "docker run --rm -it \\", + " --device=/dev/kfd \\", + " --device=/dev/dri \\", + " --group-add render \\", + " --security-opt seccomp=unconfined \\", + " --shm-size=8g \\", + " rocm/pytorch:latest", + "# Rootless podman: also pass `--userns=keep-id` and ensure the", + "# host user is in the render group; podman maps it through.", + ], + fix_id="fix-10-container", + auto_applicable=False, + verify="rocminfo | head -n 5", + notes=[ + "Use rocm/pytorch or rocm/dev-ubuntu-22.04 as a known-good image. 
" + "Mixing host ROCm + container ROCm versions is a separate footgun.", + ], + ) + return Diagnosis( + id="fix-10-container", + title="Container can't see /dev/kfd or /dev/dri/renderD*", + score=min(score, 100), + evidence=evidence, + fix=fix, + ) + + +def check_11_iommu_hang(exam: dict, symptom: str) -> Diagnosis: + """Multi-GPU hang on systems with IOMMU enabled.""" + amd_count = len(_amd_gpus(exam)) + if amd_count < 2: + return Diagnosis(id="fix-11-iommu", title="Multi-GPU IOMMU hang", score=0) + + score = 0 + evidence = [f"{amd_count} AMD GPUs detected"] + iommu = _g(exam, "iommu_kernel_param", default="") + if iommu and iommu != "pt": + score += 25 + evidence.append(f"kernel cmdline has iommu={iommu} (not 'pt')") + if not iommu: + # IOMMU is on by default on most modern BIOSes even without the + # kernel cmdline flag. A multi-GPU hang is still the classic signal. + score += 10 + evidence.append("no iommu= flag on kernel cmdline (default may be 'on')") + kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_IOMMU_HANG) + score += kw_score + evidence += kw_ev + + if score < 25: + return Diagnosis(id="fix-11-iommu", title="Multi-GPU IOMMU hang", score=0) + + fix = Fix( + summary=( + "Add `iommu=pt` to the kernel command line so DMA goes through " + "pass-through mode. This requires editing GRUB and rebooting." 
+ ), + commands=[ + "# Inspect the current cmdline:", + "cat /proc/cmdline", + "# Edit /etc/default/grub and add iommu=pt to GRUB_CMDLINE_LINUX_DEFAULT:", + "sudo $EDITOR /etc/default/grub", + "sudo update-grub # Debian/Ubuntu", + "sudo grub2-mkconfig -o /boot/grub2/grub.cfg # Fedora/RHEL", + "# Reboot for the change to take effect, then retry the multi-GPU job.", + ], + needs_sudo=True, + needs_reboot=True, + fix_id="fix-11-iommu", + auto_applicable=False, + verify="cat /proc/cmdline | grep -o 'iommu=\\w*'", + ) + return Diagnosis( + id="fix-11-iommu", + title="Multi-GPU hang on systems with IOMMU enabled", + score=min(score, 100), + evidence=evidence, + fix=fix, + ) + + +def check_12_amdgpu_install_broken(exam: dict, symptom: str) -> Diagnosis: + """amdgpu-install left a broken DKMS / repo state.""" + score = 0 + evidence: list[str] = [] + method = _g(exam, "rocm_install_method", default="") + if method == "amdgpu-install": + evidence.append("ROCm was installed via amdgpu-install") + else: + # Not a hard requirement; users sometimes hit this after the + # installer fails and they don't realize they did one. Don't add + # base score, but allow keyword evidence to count. + pass + kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_DPKG_BROKEN) + score += kw_score + evidence += kw_ev + if method == "amdgpu-install" and kw_score > 0: + score += 20 + + if score <= 0: + return Diagnosis(id="fix-12-installer", title="amdgpu-install broken state", score=0) + + fix = Fix( + summary=( + "Run amdgpu-install's documented uninstall sequence to clear the " + "half-configured state, THEN reinstall without the flag that broke it." + ), + commands=[ + "sudo amdgpu-install --uninstall", + "sudo apt autoremove --purge -y", + "sudo apt update", + "# Reinstall. 
def check_13_hip_sdk_missing(exam: dict, symptom: str) -> Diagnosis:
    """Windows: framework imports HIP but the HIP SDK isn't installed.

    Scoring signals (all read from the exam JSON or the symptom text):

    * +35 when the exam carries no ``hip_sdk_path``; in that case only,
      a further +25 when an AMD GPU is present and the framework is a
      PyTorch build whose ROCm version string starts with ``hip=``.
    * +30 when an SDK path exists but ``hipinfo_present`` is explicitly
      ``False``.
    * whatever ``_keyword_score`` awards for ``KEYWORDS_HIP_SDK_MISSING``.

    Returns a zero-score ``Diagnosis`` without a fix on non-Windows exams
    or when no signal fires; otherwise a ``Diagnosis`` (score capped at
    100) with a manual install plan.
    """
    if _g(exam, "os_family", default="") != "windows":
        return Diagnosis(id="fix-13-hip-sdk-missing", title="HIP SDK not installed", score=0)

    score = 0
    evidence: list[str] = []
    sdk_path = _g(exam, "hip_sdk_path", default="")
    hipinfo_present = _g(exam, "hipinfo_present", default=None)
    framework = _g(exam, "framework", default="")
    fw_rocm = _g(exam, "framework_rocm_version", default="") or ""
    has_amd = _g(exam, "has_amd_gpu", default=False)

    if not sdk_path:
        score += 35
        evidence.append("No HIP SDK install found under C:\\Program Files\\AMD\\ROCm")
        # Corroborating signal. It must stay inside this branch: its
        # evidence line states that the SDK is absent, which would be false
        # (and would score every healthy PyTorch-HIP host) if it were also
        # emitted when an SDK path was found.
        if has_amd and framework == "pytorch" and fw_rocm.startswith("hip="):
            score += 25
            evidence.append("PyTorch is a HIP build but the HIP SDK is not present on this host")
    elif hipinfo_present is False:
        score += 30
        evidence.append(f"HIP SDK at {sdk_path} but hipInfo.exe is missing from its bin directory")

    kw_score, kw_ev = _keyword_score(symptom, KEYWORDS_HIP_SDK_MISSING)
    score += kw_score
    evidence += kw_ev

    if score <= 0:
        return Diagnosis(id="fix-13-hip-sdk-missing", title="HIP SDK not installed", score=0)

    fix = Fix(
        summary=(
            "Install the AMD HIP SDK for Windows; the HIP runtime DLLs and "
            "hipInfo.exe come from there."
        ),
        commands=[
            "# Download and install the HIP SDK (matched to your framework's HIP major):",
            "# https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html",
            "# After install, reopen the shell so HIP_PATH and PATH pick up the new install.",
        ],
        fix_id="fix-13-hip-sdk-missing",
        auto_applicable=False,
        verify=(
            'powershell -NoProfile -Command '
            '"& \\"$env:HIP_PATH\\bin\\hipInfo.exe\\" | Select-Object -First 5"'
        ),
        notes=[
            "If you only need PyTorch on Windows AMD and don't need the C/C++ "
            "HIP toolchain, the TheRock wheels bundle their own HIP runtime "
            "and may not require a system HIP SDK install.",
        ],
    )
    return Diagnosis(
        id="fix-13-hip-sdk-missing",
        title="HIP SDK not installed (Windows)",
        score=min(score, 100),
        evidence=evidence,
        fix=fix,
    )
+ """ + if _g(exam, "os_family", default="") != "windows": + return Diagnosis(id="fix-14-adrenalin-too-old", title="Adrenalin driver too old", score=0) + + score = 0 + evidence: list[str] = [] + sdk_path = _g(exam, "hip_sdk_path", default="") + hipinfo_present = _g(exam, "hipinfo_present", default=False) + hipinfo_status = _g(exam, "hipinfo_status", default="") + adrenalin = _g(exam, "adrenalin_version", default="") + + if sdk_path and hipinfo_present and hipinfo_status not in ("ok", ""): + score += 35 + evidence.append( + f"HIP SDK at {sdk_path} is installed but hipInfo.exe reports {hipinfo_status!r}; " + "this typically means the kernel-mode driver doesn't match the SDK." + ) + if adrenalin: + evidence.append(f"Adrenalin / kernel-mode driver version: {adrenalin}") + + if symptom and re.search(r"driver.*(too old|out of date|unsupported)", symptom, re.IGNORECASE): + score += 35 + evidence.append("error mentions 'driver too old / out of date / unsupported'") + if symptom and re.search(r"hsa.*invalid agent|no agents (were )?found", symptom, re.IGNORECASE): + score += 25 + evidence.append("HSA error suggests driver/runtime can't enumerate the GPU") + + if score <= 0: + return Diagnosis(id="fix-14-adrenalin-too-old", title="Adrenalin driver too old", score=0) + + fix = Fix( + summary=( + "Update the AMD Adrenalin (or PRO) graphics driver to the version " + "the HIP SDK release notes call out as the supported pairing." 
def check_15_msvc_redist(exam: dict, symptom: str) -> Diagnosis:
    """Windows only: score a missing MSVC runtime that stops HIP DLLs loading.

    Adds 45 points when the exam reports ``msvc_redist_present`` as exactly
    ``False`` and adds whatever ``_keyword_score`` awards for
    ``KEYWORDS_MSVC_REDIST``.  Non-Windows exams and exams with no signal
    yield a zero-score ``Diagnosis`` without a fix.
    """
    if _g(exam, "os_family", default="") != "windows":
        return Diagnosis(id="fix-15-msvc-redist", title="MSVC runtime missing", score=0)

    # Only an explicit False counts; None means the probe has no answer.
    redist_missing = _g(exam, "msvc_redist_present", default=None) is False
    keyword_points, keyword_evidence = _keyword_score(symptom, KEYWORDS_MSVC_REDIST)

    evidence: list[str] = []
    if redist_missing:
        evidence.append("vcruntime140.dll / vcruntime140_1.dll not resolvable on PATH")
    evidence.extend(keyword_evidence)
    score = (45 if redist_missing else 0) + keyword_points

    if score <= 0:
        return Diagnosis(id="fix-15-msvc-redist", title="MSVC runtime missing", score=0)

    install_plan = Fix(
        summary=(
            "Install the Microsoft Visual C++ 2015-2022 redistributable so the "
            "HIP SDK's amdhip64_*.dll can load."
        ),
        commands=[
            "# Download & install (x64):",
            "# https://aka.ms/vs/17/release/vc_redist.x64.exe",
            "# After the install, reopen the shell and re-run your import / hipInfo check.",
        ],
        fix_id="fix-15-msvc-redist",
        auto_applicable=False,
        verify="where vcruntime140.dll && where vcruntime140_1.dll",
        notes=[
            "If installing the redistributable still leaves a missing-DLL error, "
            "the failing DLL is probably amdhip64_X.dll itself; that points at "
            "fix-13-hip-sdk-missing (the HIP SDK install) rather than this fix.",
        ],
    )
    return Diagnosis(
        id="fix-15-msvc-redist",
        title="MSVC runtime missing (HIP DLLs cannot load)",
        score=min(score, 100),
        evidence=evidence,
        fix=install_plan,
    )
def run_all_checks(exam: dict, symptom: str) -> list[Diagnosis]:
    """Run every checker that applies to the exam's OS and rank the results.

    The OS family is read from the exam (``"linux"`` when absent) and each
    ``CHECKERS`` entry whose ``applicable_on`` set lacks it is skipped.
    Diagnoses with a positive score are kept.  A checker that raises is
    reported as a zero-score ``checker-error-<function name>`` entry instead
    of aborting the run, so such entries can appear at the end of the list.
    The list is returned sorted by score, highest first.
    """
    running_os = _g(exam, "os_family", default="linux")
    symptom_text = symptom or ""
    collected: list[Diagnosis] = []

    for checker, supported in CHECKERS:
        if running_os not in supported:
            continue
        try:
            outcome = checker(exam, symptom_text)
        except Exception as exc:  # a buggy checker must not kill the whole diagnosis
            collected.append(Diagnosis(
                id=f"checker-error-{checker.__name__}",
                title=f"Internal checker error in {checker.__name__}",
                score=0,
                evidence=[f"{type(exc).__name__}: {exc}"],
            ))
        else:
            if outcome.score > 0:
                collected.append(outcome)

    collected.sort(key=lambda item: item.score, reverse=True)
    return collected
" + "Do not speculate; file the symptom + this examination output upstream:" + ) + print(f" {route['target']:>12s}: {route['url']}") + print() + print("Include the JSON from `python scripts/examine.py --json` in your report.") + return + + for i, d in enumerate(diagnoses[:top], 1): + tier = "HIGH" if d.score >= HIGH_CONFIDENCE else ( + "LIKELY" if d.score >= MIN_SCORE_FOR_MATCH else "WEAK" + ) + print(f"#{i} [{tier} score={d.score}/100] {d.title}") + print(f" id: {d.id}") + for e in d.evidence: + print(f" - {e}") + if d.fix: + print(f" plan: {d.fix.summary}") + for c in d.fix.commands: + print(f" $ {c}") + flags = [] + if d.fix.needs_sudo: flags.append("sudo") + if d.fix.needs_reboot: flags.append("reboot required") + if d.fix.needs_relogin: flags.append("re-login required") + if d.fix.auto_applicable: flags.append("apply_fix.py can run it") + if flags: + print(f" flags: {', '.join(flags)}") + for n in d.fix.notes: + print(f" note: {n}") + if d.fix.verify: + print(f" verify after fix: {d.fix.verify}") + print() + + high = [d for d in diagnoses if d.score >= HIGH_CONFIDENCE] + if high: + print(f"Next step: propose `apply_fix.py --fix-id {high[0].id}` to the user.") + else: + print( + "Highest-scoring match is below the HIGH_CONFIDENCE threshold. " + "Confirm one more piece of evidence with the user before applying." 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: load an exam JSON, run the checks, print them.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 when at least one diagnosis reaches ``MIN_SCORE_FOR_MATCH``,
        1 when none does, and 2 when the exam file is missing, cannot be
        read or decoded, is not valid JSON, or does not contain a JSON
        object.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--exam", type=Path, required=True,
        help="Path to the JSON produced by `examine.py --json`.",
    )
    parser.add_argument(
        "--symptom", default="",
        help="Raw error text from the user; symptom-keyword scoring uses it.",
    )
    parser.add_argument(
        "--top", type=int, default=5,
        help="Show at most this many matching diagnoses (default 5).",
    )
    parser.add_argument("--json", action="store_true",
                        help="Emit machine-readable JSON instead of the human view.")
    args = parser.parse_args(argv)

    # Read first and handle failure, instead of checking exists() and then
    # reading: a directory, an unreadable file, or a file removed between
    # the two calls would otherwise escape as an uncaught exception.
    try:
        raw = args.exam.read_text(encoding="utf-8")
    except FileNotFoundError:
        print(f"exam file not found: {args.exam}", file=sys.stderr)
        return 2
    except (OSError, UnicodeDecodeError) as exc:
        print(f"exam file could not be read: {args.exam}: {exc}", file=sys.stderr)
        return 2

    try:
        exam = json.loads(raw)
    except json.JSONDecodeError as exc:
        print(f"exam file is not valid JSON: {exc}", file=sys.stderr)
        return 2
    # The checkers are annotated to take a dict; reject any other JSON
    # value (list, string, number, null) before it reaches them.
    if not isinstance(exam, dict):
        print(
            f"exam file must contain a JSON object, got {type(exam).__name__}",
            file=sys.stderr,
        )
        return 2

    diagnoses = run_all_checks(exam, args.symptom)
    matched = [d for d in diagnoses if d.score >= MIN_SCORE_FOR_MATCH]
    if args.json:
        print(json.dumps(_to_jsonable(diagnoses, exam), indent=2))
    else:
        _print_human(diagnoses, exam, args.top)
    return 0 if matched else 1
+ +This is the first script the skill runs once it has decided the user's +framework actually touches the system ROCm install (so: PyTorch, llama.cpp, +and anything built against `/opt/rocm` on Linux or the HIP SDK on Windows, +but NOT Lemonade / LM Studio / Ollama, which ship their own runtime). + +The script collects the minimum set of facts needed to disambiguate +every known misconfiguration in `reference.md`. It never installs or +removes packages, never changes group membership, and never edits files. + +Supported platforms: + - Linux (native): full Linux probe set. + - Windows: HIP SDK + Adrenalin probes (no /sys, no rocminfo; uses + Win32_VideoController and hipInfo.exe instead). + - WSL2: detected and refused with a route-out message. The ROCm-on-WSL + flow needs Adrenalin Pro + the WSL kernel update on the Windows host + and is not in this catalog. + +Exit codes: + 0 = examination ran; results emitted. The agent should pass the JSON to + `diagnose.py` next. + 2 = wrong platform (WSL, neither Linux nor Windows, or no AMD GPU). The + agent should stop and route the user instead of running diagnose. + 3 = examination ran but something prevented a key probe from completing + and the agent should warn the user before continuing. + +Usage: + python scripts/examine.py + python scripts/examine.py --json + python scripts/examine.py --framework pytorch + python scripts/examine.py --framework llama-cpp --json + +The optional `--framework` flag scopes the framework-specific probes +(e.g. running PyTorch's `torch.version.hip`). When omitted the script +probes everything it can detect without launching a Python interpreter +for a framework that may not be installed. 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import platform +import re +import shutil +import stat +import subprocess +import sys +from dataclasses import asdict, dataclass, field +from pathlib import Path + +# Environment variables that silently change ROCm/HIP behaviour. We record +# every one of these, even when empty, so `diagnose.py` can see both the +# value and the fact that it is unset (which is itself a signal for some +# misconfigurations -- e.g. ROCM_PATH being unset is fine, but the user +# having set HSA_OVERRIDE_GFX_VERSION on a supported GPU is suspicious). +TRACKED_ENV_VARS = ( + "HSA_OVERRIDE_GFX_VERSION", + "HIP_VISIBLE_DEVICES", + "ROCR_VISIBLE_DEVICES", + "CUDA_VISIBLE_DEVICES", # PyTorch HIP also honours this name. + "GPU_DEVICE_ORDINAL", + "ROCM_PATH", + "ROCM_HOME", + "HIP_PATH", # Windows: HIP SDK install root (e.g. C:\Program Files\AMD\ROCm\6.4\). + "HIP_PLATFORM", # Windows: usually "amd"; "nvidia" means user is on the wrong toolchain. + "PYTORCH_ROCM_ARCH", + "HCC_AMDGPU_TARGET", + "AMDGPU_TARGETS", + "LD_LIBRARY_PATH", + "PATH", +) + +# Files the amdgpu-install pipeline drops on APT-based systems. Presence +# of these tells us "installed via amdgpu-install", absence + apt-installed +# ROCm packages tells us "installed via plain apt", and absence of both +# with a populated /opt/rocm typically means "tarball or pip wheel". +AMDGPU_INSTALL_MARKERS = ( + "/etc/apt/sources.list.d/amdgpu.list", + "/etc/apt/sources.list.d/rocm.list", + "/etc/apt/sources.list.d/radeon.list", + "/etc/yum.repos.d/amdgpu.repo", + "/etc/yum.repos.d/rocm.repo", +) + +# Containers we can detect cheaply from /proc/1/cgroup or marker files. +CONTAINER_MARKERS = { + "/.dockerenv": "docker", + "/run/.containerenv": "podman", +} + + +@dataclass +class GPU: + name: str = "" + gfx_target: str = "" # e.g. 
@dataclass
class Device:
    """Facts recorded about one device node path.

    Used by ``Examination`` for its ``kfd`` field and for each entry of
    ``render_devices``.
    """

    # Filesystem path of the device node.
    path: str
    # Whether the path was found.
    exists: bool
    mode: str = ""  # e.g. "crw-rw----"
    # Owning user and group of the node; empty when not recorded.
    owner_user: str = ""
    owner_group: str = ""
    # Access for the current user; None presumably means "not determined"
    # -- TODO confirm against the probe that fills these in.
    user_can_read: bool | None = None
    user_can_write: bool | None = None
def _run(cmd: list[str], timeout: float = 5.0) -> tuple[int, str, str]:
    """Run `cmd`; return (rc, stdout, stderr). Never raises.

    Args:
        cmd: Argument vector to execute (no shell is involved).
        timeout: Seconds to wait before giving up on the child process.

    Returns:
        ``(returncode, stdout, stderr)`` of the finished process, or
        ``(127, "", "")`` when the command could not be started, timed out,
        or was given arguments subprocess rejects.
    """
    try:
        r = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Decode leniently: with strict decoding, output that is not
            # valid in the locale encoding raises UnicodeDecodeError and
            # breaks the "never raises" promise.
            errors="replace",
            timeout=timeout,
            check=False,
        )
    except (subprocess.SubprocessError, OSError, ValueError):
        # SubprocessError covers the timeout, OSError covers a missing or
        # non-executable binary, ValueError covers invalid arguments such
        # as an embedded NUL byte.
        return 127, "", ""
    return r.returncode, r.stdout or "", r.stderr or ""
def _probe_cpu(e: Examination) -> None:
    """Fill in ``e.cpu_vendor`` and ``e.cpu_model`` for the detected OS.

    On Linux the first ``vendor_id`` and ``model name`` entries of
    ``/proc/cpuinfo`` are used.  On Windows the name of the first
    ``Win32_Processor`` is queried through PowerShell and the vendor is
    derived from that name; a failed query is recorded in
    ``e.probe_failures``.  Other OS families leave the fields untouched.
    """
    if e.os_family == "linux":
        for line in _read_text("/proc/cpuinfo").splitlines():
            _, sep, raw_val = line.partition(":")
            if not sep:
                # Blank separator lines between processors carry no value.
                continue
            val = raw_val.strip()
            # The vendor guard needs its own parentheses: `A and not B or C`
            # parses as `(A and not B) or C`, which let the first line of
            # the file (`processor : 0`) be taken as the vendor.
            if line.startswith("vendor_id") and (
                not e.cpu_vendor or e.cpu_vendor == "unknown"
            ):
                if "AMD" in val:
                    e.cpu_vendor = "amd"
                elif "Intel" in val:
                    e.cpu_vendor = "intel"
                else:
                    e.cpu_vendor = val.lower()
            if line.startswith("model name") and not e.cpu_model:
                e.cpu_model = val
            # Every logical CPU repeats the same entries; stop once both
            # facts are known.
            if e.cpu_vendor not in ("", "unknown") and e.cpu_model:
                break
    elif e.os_family == "windows":
        rc, out, _ = _run([
            "powershell", "-NoProfile", "-Command",
            "(Get-CimInstance Win32_Processor | Select-Object -First 1).Name",
        ], timeout=8)
        if rc == 0 and out.strip():
            e.cpu_model = out.strip().splitlines()[0].strip()
            lname = e.cpu_model.lower()
            e.cpu_vendor = "amd" if "amd" in lname else ("intel" if "intel" in lname else "unknown")
        else:
            e.probe_failures.append("Get-CimInstance Win32_Processor failed; cannot identify CPU.")
def _probe_gpus_lspci(e: Examination) -> None:
    """Populate ``e.gpus`` from ``lspci -nn -D`` output.

    Only VGA, 3D and Display controller lines are considered.  NVIDIA
    devices set ``e.has_nvidia_gpu`` and are recorded as non-AMD entries;
    AMD devices are recorded with the gfx target and APU flag guessed from
    their marketing name.  Lines from any other vendor are ignored.  A
    missing or failing ``lspci`` is noted in ``e.probe_failures`` and
    leaves the GPU list unchanged.
    """
    if not _have("lspci"):
        e.probe_failures.append("lspci not found; cannot enumerate PCI GPUs")
        return
    status, listing, _stderr = _run(["lspci", "-nn", "-D"], timeout=8)
    if status != 0:
        e.probe_failures.append("lspci returned non-zero; PCI enumeration incomplete")
        return

    controller_kind = re.compile(
        r"(VGA compatible controller|3D controller|Display controller)"
    )
    # The marketing name sits between the "<kind> [class]:" prefix and the
    # trailing "[vendor:device]" id pair.
    name_pattern = re.compile(
        r"\S+\s+(?:VGA compatible controller|3D controller|Display controller)"
        r"\s*\[\w+\]:\s*(.+?)\s*\[[\da-f]{4}:[\da-f]{4}\]",
        re.IGNORECASE,
    )

    for row in listing.splitlines():
        if controller_kind.search(row) is None:
            continue
        tokens = row.split()
        address = tokens[0] if tokens else ""
        parsed = name_pattern.match(row)
        # Fall back to the whole line when the name cannot be isolated.
        label = parsed.group(1).strip() if parsed else row

        # NVIDIA is tested first, so a line matching both vendors is
        # recorded as NVIDIA.
        if "[10de" in row or "NVIDIA" in row:
            e.has_nvidia_gpu = True
            e.gpus.append(GPU(name=label, pci_id=address, is_amd=False, is_apu=False))
        elif "[1002" in row or "Advanced Micro Devices" in row or "AMD" in row:
            gfx, apu = _classify_amd_marketing_name(label)
            e.gpus.append(GPU(
                name=label, gfx_target=gfx, pci_id=address,
                is_apu=apu, is_amd=True,
            ))
rocminfo blocks look like: + # Agent 2 + # Name: gfx1151 + # Marketing Name: AMD Radeon Graphics + # Device Type: GPU + gfx_targets: list[tuple[str, str]] = [] + cur_name = "" + cur_marketing = "" + cur_is_gpu = False + for line in out.splitlines(): + s = line.strip() + if s.startswith("Agent "): + if cur_is_gpu and cur_name.startswith("gfx"): + gfx_targets.append((cur_name, cur_marketing)) + cur_name = "" + cur_marketing = "" + cur_is_gpu = False + continue + if s.startswith("Name:"): + cur_name = s.split(":", 1)[1].strip() + elif s.startswith("Marketing Name:"): + cur_marketing = s.split(":", 1)[1].strip() + elif s.startswith("Device Type:"): + cur_is_gpu = "GPU" in s + if cur_is_gpu and cur_name.startswith("gfx"): + gfx_targets.append((cur_name, cur_marketing)) + + if not gfx_targets: + return + + # Reconcile with the lspci-derived list: prefer rocminfo's gfx target + # for any AMD entry that didn't already have one. + amd_entries = [g for g in e.gpus if g.is_amd] + for idx, (gfx, marketing) in enumerate(gfx_targets): + if idx < len(amd_entries): + amd_entries[idx].gfx_target = gfx + if marketing and not amd_entries[idx].name: + amd_entries[idx].name = marketing + # APU classification: gfx115x/gfx110x/gfx103x are APU families + # the doctor cares about. The rest are discrete. 
+ amd_entries[idx].is_apu = bool(re.match(r"gfx11[05]\d", gfx)) + else: + e.gpus.append(GPU( + name=marketing or "AMD GPU", gfx_target=gfx, + is_amd=True, is_apu=bool(re.match(r"gfx11[05]\d", gfx)), + )) + + +def _summarise_gpu_categories(e: Examination) -> None: + e.has_amd_gpu = any(g.is_amd for g in e.gpus) + e.has_apu = any(g.is_amd and g.is_apu for g in e.gpus) + e.has_discrete_amd = any(g.is_amd and g.is_apu is False for g in e.gpus) + + +# --------------------------------------------------------------------------- +# Kernel module / device probes +# --------------------------------------------------------------------------- + +def _probe_modules(e: Examination) -> None: + if e.os_family != "linux": + return + rc, out, _ = _run(["lsmod"], timeout=5) + if rc == 0: + modules = {line.split()[0] for line in out.splitlines()[1:] if line.split()} + e.amdgpu_loaded = "amdgpu" in modules + e.amdkfd_loaded = "amdkfd" in modules + else: + # /proc/modules is always readable and is the source of truth for lsmod. + txt = _read_text("/proc/modules") + if txt: + modules = {line.split()[0] for line in txt.splitlines() if line.split()} + e.amdgpu_loaded = "amdgpu" in modules + e.amdkfd_loaded = "amdkfd" in modules + + # Blacklist files. We don't try to parse every modprobe.d directive + # perfectly; we just flag any file that contains a literal "blacklist + # amdgpu" line so the agent can ask the user to inspect it. 
+ for d in ("/etc/modprobe.d", "/usr/lib/modprobe.d", "/run/modprobe.d"): + try: + for f in Path(d).glob("*.conf"): + body = _read_text(str(f)) + if re.search(r"^\s*blacklist\s+amdgpu\b", body, re.MULTILINE): + e.amdgpu_blacklisted_in.append(str(f)) + except OSError: + continue + + +def _probe_devices(e: Examination) -> None: + if e.os_family != "linux": + return + e.kfd = _stat_device("/dev/kfd") + try: + for path in sorted(Path("/dev/dri").glob("renderD*")): + e.render_devices.append(_stat_device(str(path))) + except OSError: + pass + + +def _stat_device(path: str) -> Device: + d = Device(path=path, exists=os.path.exists(path)) + if not d.exists: + return d + try: + st = os.stat(path) + except OSError as exc: + d.mode = f"stat failed: {exc}" + return d + d.mode = stat.filemode(st.st_mode) + # Resolve uid/gid to names via /etc/passwd & /etc/group; we do this by + # hand because the pwd / grp modules are unavailable inside `uv run` + # sandboxes on some systems. + d.owner_user = _uid_to_name(st.st_uid) + d.owner_group = _gid_to_name(st.st_gid) + d.user_can_read = os.access(path, os.R_OK) + d.user_can_write = os.access(path, os.W_OK) + return d + + +def _uid_to_name(uid: int) -> str: + try: + import pwd + return pwd.getpwuid(uid).pw_name + except (KeyError, ImportError, OSError): + return str(uid) + + +def _gid_to_name(gid: int) -> str: + try: + import grp + return grp.getgrgid(gid).gr_name + except (KeyError, ImportError, OSError): + return str(gid) + + +def _probe_user(e: Examination) -> None: + if e.os_family != "linux": + return + e.user_name = os.environ.get("USER") or os.environ.get("LOGNAME") or "" + rc, out, _ = _run(["id", "-Gn"], timeout=3) + if rc == 0: + e.user_groups = out.strip().split() + else: + # Fallback: scan /etc/group for our uid. 
+ try: + import grp + uid = os.getuid() + import pwd + primary_gid = pwd.getpwuid(uid).pw_gid + e.user_groups = [ + g.gr_name for g in grp.getgrall() + if e.user_name in g.gr_mem or g.gr_gid == primary_gid + ] + except (ImportError, KeyError, OSError): + pass + e.in_render_group = "render" in e.user_groups + e.in_video_group = "video" in e.user_groups + + +def _probe_secure_boot(e: Examination) -> None: + if e.os_family != "linux": + return + if _have("mokutil"): + rc, out, _ = _run(["mokutil", "--sb-state"], timeout=3) + if rc == 0: + o = out.lower() + if "enabled" in o: + e.secure_boot = "enabled" + elif "disabled" in o: + e.secure_boot = "disabled" + + +# --------------------------------------------------------------------------- +# ROCm install probes +# --------------------------------------------------------------------------- + +def _probe_rocm_install(e: Examination) -> None: + if e.os_family != "linux": + return + # 1) Canonical install path. `/opt/rocm` is a symlink to /opt/rocm-X.Y.Z + # on every supported install pattern, including pip wheels that ship a + # bundled runtime (the wheel sets ROCM_PATH instead). + rocm_dir = "" + for candidate in ("/opt/rocm", os.environ.get("ROCM_PATH", "")): + if candidate and os.path.isdir(candidate): + rocm_dir = candidate + break + e.rocm_path = rocm_dir + + # 2) Version. Modern installs put a .info/version-* file; older ones + # only have it inside /opt/rocm-X.Y.Z. Walk both. + if rocm_dir: + for fname in ("version", "version-utils", "version-libs"): + f = Path(rocm_dir) / ".info" / fname + if f.exists(): + e.rocm_version = f.read_text(encoding="utf-8", errors="replace").strip() + break + if not e.rocm_version: + # /opt/rocm-X.Y.Z symlink target. + try: + real = os.path.realpath(rocm_dir) + m = re.search(r"rocm-(\d+(?:\.\d+)+)", real) + if m: + e.rocm_version = m.group(1) + except OSError: + pass + + # 3) Install method. 
We check in priority order: amdgpu-install repo + # files, packaged ROCm on the distro repos, and finally "looks like a + # pip wheel" if /opt/rocm doesn't exist but a torch wheel bundles HIP. + for marker in AMDGPU_INSTALL_MARKERS: + if os.path.exists(marker): + e.rocm_install_method = "amdgpu-install" + e.rocm_repos_seen.append(marker) + + if not e.rocm_install_method: + if _have("dpkg"): + rc, out, _ = _run(["dpkg", "-l", "rocm-hip-runtime"], timeout=8) + if rc == 0 and "rocm-hip-runtime" in out: + e.rocm_install_method = "apt" + if not e.rocm_install_method and _have("rpm"): + rc, out, _ = _run(["rpm", "-q", "rocm-hip-runtime"], timeout=8) + if rc == 0 and "rocm-hip-runtime" in out: + e.rocm_install_method = "dnf" + if not e.rocm_install_method: + if rocm_dir: + e.rocm_install_method = "tarball-or-other" + else: + e.rocm_install_method = "none" + + # 4) Stale repo detection: more than one ROCm repo file at the same + # time. Common after `amdgpu-install` reruns with different `--rocmrelease`. + extra = [] + try: + for d in ("/etc/apt/sources.list.d", "/etc/yum.repos.d"): + for f in Path(d).glob("*"): + if re.search(r"(rocm|amdgpu|radeon)", f.name, re.IGNORECASE): + extra.append(str(f)) + except OSError: + pass + # Deduplicate while preserving order. + for x in extra: + if x not in e.rocm_repos_seen: + e.rocm_repos_seen.append(x) + + +# --------------------------------------------------------------------------- +# Framework probes +# --------------------------------------------------------------------------- + +# Inline Python the PyTorch probe pipes into the user's interpreter. Kept +# tiny so it works even on Python interpreters with broken site-packages. 
_PYTORCH_PROBE = (
    "import json,sys\n"
    "out={'ok':False}\n"
    "try:\n"
    " import torch\n"
    " out['ok']=True\n"
    " out['version']=torch.__version__\n"
    " out['hip']=getattr(torch.version,'hip',None)\n"
    " out['cuda']=getattr(torch.version,'cuda',None)\n"
    " out['is_available']=bool(torch.cuda.is_available())\n"
    " try: out['device_count']=int(torch.cuda.device_count())\n"
    " except Exception: out['device_count']=0\n"
    " try: out['arch_list']=list(torch.cuda.get_arch_list())\n"
    " except Exception: out['arch_list']=[]\n"
    "except Exception as ex:\n"
    " out['error']=type(ex).__name__+': '+str(ex)\n"
    "sys.stdout.write(json.dumps(out))\n"
)


def _parse_torch_probe(out: str) -> dict | None:
    """Decode the probe's stdout; return None unless it is a JSON object."""
    text = out.strip()
    if not text:
        return None
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None


def _probe_pytorch(e: Examination) -> None:
    """Try to introspect PyTorch in the user's default python.

    Runs `_PYTORCH_PROBE` in the current interpreter and, if that does not
    report a working torch, in `python3` from PATH. On success fills
    ``e.framework``, ``e.framework_version``, ``e.framework_rocm_version``
    and ``e.framework_arch_list``; otherwise explains why in
    ``e.framework_notes``.
    """
    py = sys.executable or shutil.which("python") or shutil.which("python3")
    if not py:
        e.framework_notes.append("No python interpreter found to probe torch.")
        return

    # `sys.executable` may be uv's own env while the user's torch lives in
    # the PATH python3. The probe prints JSON even when `import torch`
    # fails, so the fallback has to key on the parsed result, not on empty
    # output or a non-zero exit code.
    interpreters = [py]
    py2 = shutil.which("python3")
    if py2 and py2 != py:
        interpreters.append(py2)

    out = ""
    err = ""
    data = None
    for interpreter in interpreters:
        _rc, out, err = _run([interpreter, "-c", _PYTORCH_PROBE], timeout=20)
        data = _parse_torch_probe(out)
        if data is not None and data.get("ok"):
            break

    if not out.strip():
        e.framework_notes.append(
            "Could not import torch; if PyTorch is in a venv, activate it "
            "and re-run examine.py inside that venv."
        )
        # stderr may be whitespace-only, in which case there is no last line.
        stderr_lines = (err or "").strip().splitlines()
        if stderr_lines:
            e.framework_notes.append(f"python stderr: {stderr_lines[-1][:200]}")
        return
    if data is None:
        e.framework_notes.append(f"torch probe returned non-JSON: {out[:200]}")
        return
    if not data.get("ok"):
        e.framework_notes.append(f"torch import failed: {data.get('error', 'unknown')}")
        return
    e.framework = "pytorch"
    e.framework_version = data.get("version", "")
    hip = data.get("hip")
    cuda = data.get("cuda")
    if hip:
        e.framework_rocm_version = f"hip={hip}"
    elif cuda:
        e.framework_rocm_version = f"cuda={cuda}"
        e.framework_notes.append(
            "This torch wheel is a CUDA build, not a ROCm build. Reinstall "
            "from the ROCm wheel index."
        )
    arch = data.get("arch_list") or []
    e.framework_arch_list = [a for a in arch if isinstance(a, str)]
    if data.get("is_available") is False:
        e.framework_notes.append(
            "torch.cuda.is_available() returned False -- runtime can't see a GPU."
        )


def _probe_llama_cpp(e: Examination) -> None:
    """Best-effort probe of a llama.cpp build on PATH.

    Uses the first of `llama-cli`, `llama-server`, `main` found on PATH and
    inspects its `--version` output for HIP/ROCm markers.
    """
    binary = None
    for name in ("llama-cli", "llama-server", "main"):
        p = shutil.which(name)
        if p:
            binary = p
            break
    if not binary:
        e.framework_notes.append("No llama.cpp binary (llama-cli/llama-server/main) on PATH.")
        return
    rc, out, err = _run([binary, "--version"], timeout=10)
    body = out + err
    if rc != 0 and not body:
        e.framework_notes.append(f"{binary} --version exited rc={rc}")
        return
    e.framework = "llama-cpp"
    e.framework_version = body.strip().splitlines()[0][:200] if body.strip() else "unknown"
    # Newer builds print "ROCm" or "HIP" in --version when GGML_HIP=ON.
    if "HIP" in body or "ROCm" in body or "hipBLAS" in body:
        e.framework_rocm_version = "GGML_HIP=ON"
    else:
        e.framework_notes.append(
            "llama.cpp binary doesn't advertise HIP/ROCm support; was it built "
            "with `cmake -DGGML_HIP=ON -DAMDGPU_TARGETS=`?"
        )


def _probe_framework(e: Examination, requested: str | None) -> None:
    """Run the framework probe selected by `requested`.

    `requested` is "skip", "pytorch", "llama-cpp", or None for auto-detect.
    """
    if requested == "skip":
        e.framework = "skipped"
        return
    if requested == "pytorch":
        _probe_pytorch(e)
        return
    if requested == "llama-cpp":
        _probe_llama_cpp(e)
        return
    # Auto-detect: prefer PyTorch (the common case for the doctor) and only
    # fall through to llama.cpp when no working torch was found.
    py = sys.executable or shutil.which("python") or shutil.which("python3")
    if py:
        _probe_pytorch(e)
        if e.framework == "pytorch":
            return
    _probe_llama_cpp(e)


# ---------------------------------------------------------------------------
# Misc probes
# ---------------------------------------------------------------------------

def _probe_env(e: Examination) -> None:
    """Capture tracked environment variables and scan LD_LIBRARY_PATH for HIP.

    ``e.hip_libs_on_ld_path`` ends up True (a libamdhip64* was found), False
    (LD_LIBRARY_PATH is set but has none) or None (LD_LIBRARY_PATH unset or
    empty).
    """
    for k in TRACKED_ENV_VARS:
        v = os.environ.get(k)
        if v is None:
            continue
        # Truncate enormous PATHs so JSON output stays human-scale.
        if k in ("PATH", "LD_LIBRARY_PATH") and len(v) > 4000:
            v = v[:4000] + "...[truncated]"
        e.env[k] = v
    # Quick check: does any path in LD_LIBRARY_PATH carry a libamdhip64?
    ld = os.environ.get("LD_LIBRARY_PATH", "")
    hits = []
    for d in ld.split(os.pathsep):
        if not d:
            continue
        try:
            for hit in Path(d).glob("libamdhip64*"):
                # One hit per directory is enough evidence.
                hits.append(str(hit))
                break
        except OSError:
            continue
    if hits:
        e.hip_libs_on_ld_path = True
        e.notes.append(f"libamdhip64 visible via LD_LIBRARY_PATH: {hits[0]}")
    else:
        e.hip_libs_on_ld_path = False if ld else None


def _probe_container(e: Examination) -> None:
    """Detect a container via marker files, then via `/proc/1/cgroup`."""
    for marker, kind in CONTAINER_MARKERS.items():
        if os.path.exists(marker):
            e.in_container = True
            e.container_kind = kind
            return
    cg = _read_text("/proc/1/cgroup")
    if cg and any(x in cg for x in ("docker", "containerd", "lxc", "kubepods", "podman")):
        e.in_container = True
        e.container_kind = e.container_kind or "container"


def _probe_dmesg_amdgpu(e: Examination) -> None:
    """Collect recent amdgpu/amdkfd kernel messages that look like failures."""
    if e.os_family != "linux":
        return
    # We try journalctl first because it works for unprivileged users when
    # the systemd journal is world-readable; dmesg is usually root-only on
    # modern kernels (`kernel.dmesg_restrict=1`).
    text = ""
    rc, out, _ = _run(["journalctl", "-k", "--no-pager", "-n", "400"], timeout=8)
    if rc == 0 and out:
        text = out
    else:
        rc, out, _ = _run(["dmesg"], timeout=5)
        if rc == 0:
            text = out
    if not text:
        return
    # Keep at most the last 15 amdgpu/amdkfd lines that contain one of the
    # well-known failure substrings, so the JSON stays small.
    interesting = (
        "page fault", "RAS Controller", "vm_fault", "amdgpu_device_init",
        "OUT_OF_REGISTERS", "ring", "GPU reset", "PSP", "HW_FAULT",
    )
    needles = tuple(s.lower() for s in interesting)
    hits: list[str] = []
    for line in text.splitlines():
        if "amdgpu" not in line and "amdkfd" not in line:
            continue
        lowered = line.lower()
        if any(needle in lowered for needle in needles):
            hits.append(line.strip()[:300])
    e.dmesg_amdgpu_tail = hits[-15:]


# ---------------------------------------------------------------------------
# Windows-specific probes
#
# Windows has no equivalent of /sys, /proc, lsmod, lspci, or rocminfo. Almost
# everything we need is reachable through PowerShell + CIM (Win32_*) and a
# couple of well-known install directories. The HIP SDK ships hipInfo.exe,
# which is the rocminfo analog. The kernel-mode GPU driver is part of the
# AMD Adrenalin install and reports itself via Win32_VideoController.
# ---------------------------------------------------------------------------

# gfx targets that only exist as integrated graphics, per LLVM's AMDGPU
# processor table. gfx1100-gfx1102 are discrete RDNA3 cards and must not
# match.
_HIP_APU_GFX_RE = re.compile(r"gfx(?:90c|103[356]|1103|115\d)")


def _hip_sdk_version_key(name: str) -> tuple[int, ...]:
    """Return the leading dotted version of `name` as ints, or () if none.

    Used to order SDK directories numerically, so "6.10" sorts above "6.2".
    """
    m = re.match(r"\d+(?:\.\d+)+", name)
    return tuple(int(part) for part in m.group(0).split(".")) if m else ()


def _probe_gpus_windows(e: Examination) -> None:
    """Enumerate AMD/NVIDIA display adapters via Win32_VideoController."""
    rc, out, _ = _run([
        "powershell", "-NoProfile", "-Command",
        "Get-CimInstance Win32_VideoController | "
        "Select-Object -Property Name,PNPDeviceID,DriverVersion | "
        "ConvertTo-Json -Compress",
    ], timeout=10)
    if rc != 0 or not out.strip():
        e.probe_failures.append(
            "Get-CimInstance Win32_VideoController failed; cannot enumerate GPUs."
        )
        return
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        e.probe_failures.append("Win32_VideoController returned non-JSON output.")
        return
    # ConvertTo-Json emits a bare object for a single adapter.
    if isinstance(data, dict):
        data = [data]
    if not isinstance(data, list):
        # e.g. `null`: valid JSON, but nothing to enumerate.
        return
    for entry in data:
        if not isinstance(entry, dict):
            continue
        name = (entry.get("Name") or "").strip()
        pnp = (entry.get("PNPDeviceID") or "").strip()
        is_amd = "VEN_1002" in pnp.upper() or "AMD" in name.upper() or "RADEON" in name.upper()
        is_nvidia = "VEN_10DE" in pnp.upper() or "NVIDIA" in name.upper()
        if is_nvidia:
            e.has_nvidia_gpu = True
            e.gpus.append(GPU(name=name, pci_id=pnp, is_amd=False, is_apu=False))
            continue
        if not is_amd:
            continue
        gfx_guess, is_apu_guess = _classify_amd_marketing_name(name)
        e.gpus.append(GPU(
            name=name, gfx_target=gfx_guess, pci_id=pnp,
            is_apu=is_apu_guess, is_amd=True,
        ))


def _probe_hip_sdk_windows(e: Examination) -> None:
    """Locate the HIP SDK install and run hipInfo for ground-truth gfx target.

    Looks at `HIP_PATH` first, then at the version-named subdirectories of
    `C:\\Program Files\\AMD\\ROCm` and `C:\\Program Files (x86)\\AMD\\ROCm`,
    newest version first. Multiple SDKs can coexist; `HIP_PATH` wins when it
    points at a directory.
    """
    candidates: list[Path] = []
    hp = os.environ.get("HIP_PATH")
    if hp and Path(hp).is_dir():
        candidates.append(Path(hp))
    for root in (r"C:\Program Files\AMD\ROCm", r"C:\Program Files (x86)\AMD\ROCm"):
        base = Path(root)
        try:
            if not base.is_dir():
                continue
            versioned = [
                child for child in base.iterdir()
                if child.is_dir() and _hip_sdk_version_key(child.name)
            ]
        except OSError:
            continue
        # Numeric, not lexicographic: "6.10" must outrank "6.2".
        versioned.sort(key=lambda child: _hip_sdk_version_key(child.name), reverse=True)
        candidates.extend(versioned)
    if not candidates:
        return
    chosen = candidates[0]
    e.hip_sdk_path = str(chosen)
    m = re.search(r"(\d+(?:\.\d+)+)$", chosen.name)
    if m:
        e.hip_sdk_version = m.group(1)

    hipinfo = chosen / "bin" / "hipInfo.exe"
    if not hipinfo.exists():
        e.hipinfo_present = False
        e.hipinfo_status = "missing"
        return
    e.hipinfo_present = True
    rc, out, err = _run([str(hipinfo)], timeout=15)
    if rc != 0:
        merged = (out + "\n" + err).lower()
        if "no rocm" in merged or "no devices" in merged:
            e.hipinfo_status = "not-loaded"
        else:
            e.hipinfo_status = f"error rc={rc}"
        return
    e.hipinfo_status = "ok"

    # Take the first `Name:` line as the device name and the first
    # `gcnArchName:` (or, failing that, `arch:`) line as the gfx target.
    gfx = ""
    name = ""
    for line in out.splitlines():
        s = line.strip()
        if s.startswith("Name:") and not name:
            name = s.split(":", 1)[1].strip()
        if s.startswith("gcnArchName:") and not gfx:
            val = s.split(":", 1)[1].strip()
            if val.startswith("gfx"):
                # Drop feature suffixes after the target, e.g. ":xnack-".
                gfx = val.split(":")[0]
        if s.startswith("arch:") and not gfx:
            val = s.split(":", 1)[1].strip()
            if val.startswith("gfx"):
                gfx = val
        if gfx and name:
            break
    amd_entries = [g for g in e.gpus if g.is_amd]
    if gfx and amd_entries:
        first = amd_entries[0]
        if not first.gfx_target:
            first.gfx_target = gfx
            first.is_apu = bool(_HIP_APU_GFX_RE.fullmatch(gfx))
        if name and not first.name:
            first.name = name


def _probe_adrenalin_windows(e: Examination) -> None:
    """Best-effort probe of the AMD Adrenalin / kernel-mode driver version."""
    rc, out, _ = _run([
        "powershell", "-NoProfile", "-Command",
        "(Get-CimInstance Win32_VideoController | "
        "Where-Object { $_.Name -like '*AMD*' -or $_.Name -like '*Radeon*' } | "
        "Select-Object -First 1).DriverVersion",
    ], timeout=8)
    if rc == 0 and out.strip():
        e.adrenalin_version = out.strip().splitlines()[0].strip()


def _probe_msvc_redist_windows(e: Examination) -> None:
    """Check whether `vcruntime140.dll` and `vcruntime140_1.dll` are loadable.

    The HIP SDK's amdhip64_*.dll links against the MSVC 2015-2022 runtime;
    when the redistributable isn't installed, `import torch` fails with a
    DLL-load error that points at vcruntime140_1.dll. Searches every PATH
    entry plus System32 and SysWOW64 under `SystemRoot`.
    """
    paths = os.environ.get("PATH", "").split(os.pathsep)
    sysroot = os.environ.get("SystemRoot") or r"C:\Windows"
    paths.extend([
        os.path.join(sysroot, "System32"),
        os.path.join(sysroot, "SysWOW64"),
    ])
    have_140 = False
    have_140_1 = False
    for d in paths:
        if not d:
            continue
        try:
            p = Path(d)
            if not p.is_dir():
                continue
            if (p / "vcruntime140.dll").exists():
                have_140 = True
            if (p / "vcruntime140_1.dll").exists():
                have_140_1 = True
        except OSError:
            continue
        if have_140 and have_140_1:
            break
    e.msvc_redist_present = have_140 and have_140_1


def _probe_env_windows(e: Examination) -> None:
    """Capture the env vars the diagnosis catalog reads on Windows.

    Mirrors `_probe_env` for Linux but skips the LD_LIBRARY_PATH scan
    (Windows uses the PATH-based DLL search instead).
    """
    for k in TRACKED_ENV_VARS:
        v = os.environ.get(k)
        if v is None:
            continue
        if k == "PATH" and len(v) > 4000:
            v = v[:4000] + "...[truncated]"
        e.env[k] = v


# ---------------------------------------------------------------------------
# Orchestration
# ---------------------------------------------------------------------------

def examine(requested_framework: str | None) -> Examination:
    """Run every probe that applies to this platform and return the result.

    `requested_framework` is forwarded to `_probe_framework`. On WSL and on
    unsupported OS families only a note is recorded.
    """
    e = Examination()
    _probe_os(e)
    if e.is_wsl:
        # WSL is a real, common environment but the failure modes there
        # (Adrenalin Pro on the Windows host, the WSL kernel update, the
        # /usr/lib/wsl/lib loader handoff) are NOT in this catalog. Refuse
        # explicitly rather than running Linux-native probes that would all
        # mislead.
        e.notes.append(
            "Detected WSL2. rocm-doctor does not cover the ROCm-on-WSL flow "
            "(it requires Adrenalin Pro + the WSL kernel update on the "
            "Windows host). Either run this script on the native Linux "
            "host, or follow AMD's WSL guide directly: "
            "https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/install/installryz/wsl/howto_wsl.html"
        )
        return e
    if e.os_family == "linux":
        _probe_cpu(e)
        _probe_gpus_lspci(e)
        _probe_gpus_rocminfo(e)
        _summarise_gpu_categories(e)
        _probe_modules(e)
        _probe_devices(e)
        _probe_user(e)
        _probe_secure_boot(e)
        _probe_rocm_install(e)
        _probe_env(e)
        _probe_container(e)
        _probe_dmesg_amdgpu(e)
        _probe_framework(e, requested_framework)
        return e
    if e.os_family == "windows":
        _probe_cpu(e)
        _probe_gpus_windows(e)
        _probe_hip_sdk_windows(e)
        _probe_adrenalin_windows(e)
        _probe_msvc_redist_windows(e)
        _summarise_gpu_categories(e)
        _probe_env_windows(e)
        _probe_framework(e, requested_framework)
        return e
    e.notes.append(
        f"rocm-doctor supports Linux and Windows; got {e.os_family}. "
        "This skill cannot help on this platform."
    )
    return e


# ---------------------------------------------------------------------------
# Human-readable rendering
# ---------------------------------------------------------------------------

def _fmt_yesno(v: bool | None) -> str:
    """Render a tri-state flag as "yes", "no" or "unknown" (for None)."""
    return "unknown" if v is None else ("yes" if v else "no")


def _print_gpus(e: Examination) -> None:
    """Print one line per detected GPU, or a placeholder when there are none."""
    print("\nGPUs:")
    if not e.gpus:
        if e.os_family == "linux":
            print(" (none detected; lspci returned no VGA/3D/Display controllers)")
        else:
            print(" (none detected; Win32_VideoController returned no AMD/NVIDIA adapters)")
    for g in e.gpus:
        flag = ""
        if g.is_amd and g.is_apu:
            flag = " [AMD APU]"
        elif g.is_amd:
            flag = " [AMD dGPU]"
        elif "NVIDIA" in g.name.upper():
            flag = " [NVIDIA]"
        print(f" - {g.pci_id} {g.name or 'unknown'} gfx={g.gfx_target or '?'}{flag}")


def _print_framework_block(e: Examination) -> None:
    """Print the framework probe results and any notes it produced."""
    print("\nFramework:")
    print(f" detected: {e.framework}")
    if e.framework_version:
        print(f" version: {e.framework_version}")
    if e.framework_rocm_version:
        print(f" rocm/hip: {e.framework_rocm_version}")
    if e.framework_arch_list:
        print(f" arch list: {' '.join(e.framework_arch_list)}")
    for n in e.framework_notes:
        print(f" note: {n}")


def _print_env_block(e: Examination) -> None:
    """Print captured environment variables, each value capped at 200 chars."""
    if not e.env:
        return
    print("\nRelevant environment variables (set in current shell):")
    for k, v in e.env.items():
        display = v if len(v) <= 200 else (v[:200] + "...")
        print(f" {k}={display}")


def _print_human_linux(e: Examination) -> None:
    """Print the Linux-specific sections of the human-readable report."""
    print(f"Kernel: {e.kernel_release}")
    if e.iommu_kernel_param:
        print(f" iommu= {e.iommu_kernel_param}")
    print(f"CPU: {e.cpu_model} (vendor: {e.cpu_vendor})")
    if e.secure_boot != "unknown":
        print(f"Secure Boot: {e.secure_boot}")
    if e.in_container:
        print(f"Container: yes ({e.container_kind})")

    _print_gpus(e)

    print("\nDriver & devices:")
    print(f" amdgpu loaded: {_fmt_yesno(e.amdgpu_loaded)}")
    if e.amdgpu_blacklisted_in:
        print(f" amdgpu blacklisted in: {', '.join(e.amdgpu_blacklisted_in)}")
    print(f" amdkfd loaded: {_fmt_yesno(e.amdkfd_loaded)}")
    print(f" rocminfo: {e.rocminfo_status}")
    if e.kfd:
        print(f" /dev/kfd: exists={e.kfd.exists} mode={e.kfd.mode} "
              f"owner={e.kfd.owner_user}:{e.kfd.owner_group} "
              f"r={_fmt_yesno(e.kfd.user_can_read)} w={_fmt_yesno(e.kfd.user_can_write)}")
    for d in e.render_devices:
        print(f" {d.path}: mode={d.mode} owner={d.owner_user}:{d.owner_group} "
              f"r={_fmt_yesno(d.user_can_read)} w={_fmt_yesno(d.user_can_write)}")

    print("\nUser:")
    print(f" name: {e.user_name or 'unknown'}")
    print(f" in render group: {_fmt_yesno(e.in_render_group)}")
    print(f" in video group: {_fmt_yesno(e.in_video_group)}")
    if e.user_groups:
        print(f" all groups: {' '.join(e.user_groups)}")

    print("\nROCm install:")
    print(f" path: {e.rocm_path or 'not found'}")
    print(f" version: {e.rocm_version or 'unknown'}")
    print(f" install method: {e.rocm_install_method or 'unknown'}")
    if e.rocm_repos_seen:
        print(f" repos seen: {len(e.rocm_repos_seen)} file(s)")
        for r in e.rocm_repos_seen:
            print(f" - {r}")

    _print_framework_block(e)
    _print_env_block(e)

    if e.dmesg_amdgpu_tail:
        print("\nRecent amdgpu/amdkfd kernel messages (last few interesting lines):")
        for line in e.dmesg_amdgpu_tail:
            print(f" | {line}")


def _print_human_windows(e: Examination) -> None:
    """Print the Windows-specific sections of the human-readable report."""
    print(f"CPU: {e.cpu_model or 'unknown'} (vendor: {e.cpu_vendor})")

    _print_gpus(e)

    print("\nDriver & runtime:")
    print(f" Adrenalin driver: {e.adrenalin_version or 'unknown'}")
    print(f" hipInfo: {e.hipinfo_status or 'missing'}")
    print(f" MSVC redist: {_fmt_yesno(e.msvc_redist_present)}")

    print("\nHIP SDK install:")
    print(f" path: {e.hip_sdk_path or 'not found'}")
    print(f" version: {e.hip_sdk_version or 'unknown'}")

    _print_framework_block(e)
    _print_env_block(e)


def _print_human(e: Examination) -> None:
    """Print the full human-readable report for `e` to stdout.

    On WSL or an unsupported OS only the header and the notes are printed.
    """
    print("rocm-doctor -- system examination (read-only)")
    print("=" * 60)
    print(f"OS: {e.os_family} {e.distro_id} {e.distro_version}".strip())
    if e.is_wsl:
        print("WSL: yes (out of scope; see notes)")
    if e.is_wsl or e.os_family not in ("linux", "windows"):
        for n in e.notes:
            print(f" note: {n}")
        return

    if e.os_family == "linux":
        _print_human_linux(e)
    elif e.os_family == "windows":
        _print_human_windows(e)

    if e.probe_failures:
        print("\nProbes that did not complete:")
        for p in e.probe_failures:
            print(f" - {p}")

    if e.notes:
        print("\nNotes:")
        for n in e.notes:
            print(f" - {n}")

    print("\nNext step: feed this examination into diagnose.py:")
    print(" python scripts/examine.py --json > exam.json")
    print(" python scripts/diagnose.py --exam exam.json --symptom \"\"")


def _to_jsonable(e: Examination) -> dict:
    """Return `e` as a plain dict; asdict() also converts nested dataclasses."""
    return asdict(e)


def _exit_code(e: Examination) -> int:
    """Map an examination to the script's exit status.

    Returns 2 when the skill cannot help (WSL, unsupported OS, or no AMD GPU
    found), 3 when no GPU could be enumerated because probes failed, and 0
    otherwise.
    """
    if e.is_wsl:
        # WSL is detected but explicitly out of scope. Treat like "wrong
        # platform" so the agent stops and routes the user.
        return 2
    if e.os_family not in ("linux", "windows"):
        return 2
    # Probes that didn't complete are a soft warning, not a hard fail --
    # unless they left us with no GPU information at all. This has to run
    # before the "no AMD GPU" check: an empty GPU list always implies
    # has_amd_gpu is False, which used to make this branch unreachable.
    tool_present = e.rocminfo_present if e.os_family == "linux" else e.hipinfo_present
    if e.probe_failures and not tool_present and not e.gpus:
        return 3
    if not e.has_amd_gpu:
        # NVIDIA-only or no GPU at all -- this skill can't help.
        return 2
    return 0


def main(argv: list[str] | None = None) -> int:
    """Parse the command line, run the examination and print the report.

    Returns the exit status computed by `_exit_code`.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--json", action="store_true",
                        help="Emit machine-readable JSON for diagnose.py.")
    parser.add_argument(
        "--framework",
        choices=["pytorch", "llama-cpp", "skip", "auto"],
        default="auto",
        help="Which framework probe to run (default: auto-detect).",
    )
    args = parser.parse_args(argv)

    requested = None if args.framework == "auto" else args.framework
    e = examine(requested)
    if args.json:
        print(json.dumps(_to_jsonable(e), indent=2))
    else:
        _print_human(e)
    return _exit_code(e)


if __name__ == "__main__":
    raise SystemExit(main())