Skip to content

Commit 0c6d5f0

Browse files
fix(installer): fall back when AMD devices are unavailable
Closes #1131
1 parent add10dd commit 0c6d5f0

12 files changed

Lines changed: 400 additions & 39 deletions

File tree

dream-server/docs/TROUBLESHOOTING.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,28 @@ sudo nvidia-ctk runtime configure --runtime=docker
3838
sudo systemctl restart docker
3939
```
4040

41+
### AMD GPU Devices Missing in LXD/LXC
42+
43+
**Error:** installer reports missing `/dev/kfd`, `/dev/dri`, or `/dev/dri/renderD*`, or AMD services never become healthy inside an LXD container.
44+
45+
**Cause:** LXD/LXC containers can expose enough host CPU/sysfs information for Dream Server to recognize AMD/Strix Halo hardware while still hiding the actual GPU device nodes that Docker must mount into ROCm containers.
46+
47+
**Fix for GPU acceleration:** pass the GPU devices from the LXD host into the container, then re-run the installer.
48+
49+
```bash
50+
lxc config set <container> security.nesting=true
51+
lxc config device add <container> gpu gpu
52+
lxc config device add <container> kfd unix-char path=/dev/kfd
53+
```
54+
55+
**CPU fallback:** if you do not need GPU acceleration inside the container, run:
56+
57+
```bash
58+
GPU_BACKEND=cpu ./install.sh
59+
```
60+
61+
Fresh installs now fall back to CPU mode automatically when AMD is auto-detected but the required device nodes are unavailable inside a container. If you explicitly force `GPU_BACKEND=amd`, the installer fails fast with passthrough guidance instead.
62+
4163
---
4264

4365
## Startup Issues

dream-server/installers/lib/compose-select.sh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ resolve_compose_config() {
5151
COMPOSE_FLAGS="-f docker-compose.base.yml"
5252
COMPOSE_FILE="docker-compose.base.yml"
5353
fi
54+
elif [[ "$GPU_BACKEND" == "cpu" ]]; then
55+
if [[ -f "$SCRIPT_DIR/docker-compose.base.yml" && -f "$SCRIPT_DIR/docker-compose.cpu.yml" ]]; then
56+
COMPOSE_FLAGS="-f docker-compose.base.yml -f docker-compose.cpu.yml"
57+
COMPOSE_FILE="docker-compose.cpu.yml"
58+
fi
5459
elif [[ "$TIER" == "SH_LARGE" || "$TIER" == "SH_COMPACT" ]]; then
5560
if [[ -f "$SCRIPT_DIR/docker-compose.base.yml" && -f "$SCRIPT_DIR/docker-compose.amd.yml" ]]; then
5661
COMPOSE_FLAGS="-f docker-compose.base.yml -f docker-compose.amd.yml"
@@ -66,11 +71,6 @@ resolve_compose_config() {
6671
COMPOSE_FLAGS="-f docker-compose.base.yml -f docker-compose.intel.yml"
6772
COMPOSE_FILE="docker-compose.intel.yml"
6873
fi
69-
elif [[ "$GPU_BACKEND" == "cpu" ]]; then
70-
if [[ -f "$SCRIPT_DIR/docker-compose.base.yml" && -f "$SCRIPT_DIR/docker-compose.cpu.yml" ]]; then
71-
COMPOSE_FLAGS="-f docker-compose.base.yml -f docker-compose.cpu.yml"
72-
COMPOSE_FILE="docker-compose.cpu.yml"
73-
fi
7474
else
7575
if [[ -f "$SCRIPT_DIR/docker-compose.base.yml" && -f "$SCRIPT_DIR/docker-compose.nvidia.yml" ]]; then
7676
COMPOSE_FLAGS="-f docker-compose.base.yml -f docker-compose.nvidia.yml"

dream-server/installers/lib/detection.sh

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,132 @@ calculate_llama_cpu_budget() {
146146
echo "$limit $reservation $available"
147147
}
148148

149+
# Report whether the installer appears to be running inside a container.
#
# Checks, in order: the /.dockerenv marker, the /run/.containerenv marker,
# `systemd-detect-virt --container`, well-known runtime names in
# /proc/1/cgroup, and a `container=` variable in PID 1's environment.
#
# Arguments: none
# Outputs:   nothing
# Returns:   0 if any container marker is found, 1 otherwise
ds_in_container() {
    [[ -f /.dockerenv ]] && return 0
    [[ -f /run/.containerenv ]] && return 0
    if command -v systemd-detect-virt >/dev/null 2>&1; then
        systemd-detect-virt --container --quiet 2>/dev/null && return 0
    fi
    if [[ -f /proc/1/cgroup ]] && grep -qiE '(docker|containerd|kubepods|lxc|libpod)' /proc/1/cgroup 2>/dev/null; then
        return 0
    fi
    # /proc/1/environ is NUL-separated. Parse it with bash's `read -d ''`
    # instead of `awk -v RS='\0'`: a NUL record separator is only reliable in
    # gawk, other awk implementations treat it like an empty RS.
    if [[ -r /proc/1/environ ]]; then
        local entry
        while IFS= read -r -d '' entry || [[ -n "$entry" ]]; do
            if [[ "$entry" == "container" || "$entry" == container=* ]]; then
                return 0
            fi
        done 2>/dev/null < /proc/1/environ
    fi
    return 1
}
163+
164+
# Print a short label naming the container runtime the installer runs in.
#
# Preference order: the name reported by `systemd-detect-virt --container`
# (unless it is empty or "none"), the value of `container=` in PID 1's
# environment, then marker files / cgroup contents. Falls back to the
# generic label "container".
#
# Arguments: none
# Outputs:   one label on stdout
# Returns:   0
ds_container_label() {
    local virt entry
    if command -v systemd-detect-virt >/dev/null 2>&1; then
        virt=$(systemd-detect-virt --container 2>/dev/null || true)
        if [[ -n "$virt" && "$virt" != "none" ]]; then
            echo "$virt"
            return 0
        fi
    fi
    # /proc/1/environ is NUL-separated. `read -d ''` handles that natively,
    # whereas `awk -v RS='\0'` only works in gawk. Stripping the
    # "container=" prefix also keeps values that themselves contain '='.
    if [[ -r /proc/1/environ ]]; then
        while IFS= read -r -d '' entry || [[ -n "$entry" ]]; do
            if [[ "$entry" == container=?* ]]; then
                echo "${entry#container=}"
                return 0
            fi
        done 2>/dev/null < /proc/1/environ
    fi
    if [[ -f /.dockerenv ]]; then
        echo "docker"
    elif [[ -f /run/.containerenv ]]; then
        echo "container"
    elif [[ -f /proc/1/cgroup ]] && grep -qiE 'lxc|lxd' /proc/1/cgroup 2>/dev/null; then
        echo "lxc"
    else
        echo "container"
    fi
}
185+
186+
# List the AMD GPU device nodes that are required but not present.
#
# Globals:   DREAM_AMD_DEVICE_ROOT (read) - alternative device root; when it
#            is set, kfd only has to exist (it need not be a character
#            device). Defaults to /dev.
# Arguments: none
# Outputs:   one missing path per line on stdout; nothing at all when every
#            required node is present
# Returns:   0
amd_gpu_missing_runtime_devices() {
    local root="${DREAM_AMD_DEVICE_ROOT:-/dev}"
    local kfd="$root/kfd"
    local dri="$root/dri"
    local -a missing=()

    if [[ -n "${DREAM_AMD_DEVICE_ROOT:-}" ]]; then
        [[ -e "$kfd" ]] || missing+=("$kfd")
    else
        [[ -c "$kfd" ]] || missing+=("$kfd")
    fi

    if [[ ! -d "$dri" ]]; then
        missing+=("$dri")
    elif ! compgen -G "$dri/renderD*" >/dev/null; then
        missing+=("$dri/renderD*")
    fi

    # Only expand the array when it has elements: expanding an empty array
    # makes printf emit a stray blank line, and aborts under `set -u` on
    # bash releases older than 4.4.
    if (( ${#missing[@]} > 0 )); then
        printf '%s\n' "${missing[@]}"
    fi
}
206+
207+
# Succeed only when amd_gpu_missing_runtime_devices reports no missing node.
#
# Arguments: none
# Outputs:   nothing
# Returns:   0 when the missing-device list is empty, 1 otherwise
amd_gpu_runtime_devices_available() {
    if [[ -n "$(amd_gpu_missing_runtime_devices)" ]]; then
        return 1
    fi
    return 0
}
210+
211+
# Join the output of amd_gpu_missing_runtime_devices into "a, b, c" form.
#
# The list is read line by line so that a path containing spaces stays one
# item; blank lines are skipped.
#
# Arguments: none
# Outputs:   comma-separated list on stdout without a trailing newline;
#            empty when nothing is missing
# Returns:   0
amd_gpu_missing_devices_csv() {
    local line joined=""
    while IFS= read -r line || [[ -n "$line" ]]; do
        [[ -n "$line" ]] || continue
        joined+="${joined:+, }$line"
    done < <(amd_gpu_missing_runtime_devices)
    printf '%s' "$joined"
}
216+
217+
# Explain to the user which AMD device nodes are missing and how to fix it.
#
# Arguments: $1 - optional, preformatted list of missing devices; when empty
#                 the list is taken from amd_gpu_missing_devices_csv
# Outputs:   a warning via ai_warn followed by hints via ai: LXD passthrough
#            commands when ds_in_container succeeds, modprobe hints otherwise
# Returns:   status of the last ai call
show_amd_gpu_device_guidance() {
    local missing_list="${1:-}"
    local runtime_label

    if [[ -z "$missing_list" ]]; then
        missing_list="$(amd_gpu_missing_devices_csv)" || true
    fi
    ai_warn "AMD GPU device nodes unavailable: ${missing_list:-unknown}"

    # Bare-metal hosts only need the kernel modules; handle them first.
    if ! ds_in_container; then
        ai "On native Linux, make sure the amdgpu/amdkfd modules are loaded and /dev/dri/renderD* exists."
        ai " sudo modprobe amdgpu"
        ai " sudo modprobe amdkfd"
        return
    fi

    runtime_label="$(ds_container_label)"
    ai "Container environment detected (${runtime_label}). LXD/LXC containers can see CPU/sysfs clues while GPU device nodes stay hidden."
    ai "For LXD GPU acceleration, pass the devices from the host, for example:"
    ai " lxc config device add <container> gpu gpu"
    ai " lxc config device add <container> kfd unix-char path=/dev/kfd"
}
233+
234+
# Switch the installer's detection state to CPU-only mode.
#
# Globals written: GPU_BACKEND, GPU_NAME, GPU_VRAM, GPU_COUNT,
#                  GPU_MEMORY_TYPE, GPU_DEVICE_ID, HAS_NPU, BACKEND_ID,
#                  CAP_LLM_BACKEND, CAP_GPU_VENDOR, CAP_GPU_NAME,
#                  CAP_GPU_VRAM_MB, CAP_GPU_COUNT, CAP_GPU_MEMORY_TYPE,
#                  CAP_RECOMMENDED_TIER, CAP_COMPOSE_OVERLAYS, and DREAM_MODE
#                  (only when it was "lemonade", which becomes "local")
# Arguments: $1 - optional reason shown to the user
# Outputs:   the reason via ai_warn, then a notice via ai
# Returns:   0
apply_cpu_gpu_fallback() {
    local why="${1:-AMD GPU runtime devices are unavailable.}"

    ai_warn "$why"
    ai "Using CPU mode so installation can complete without GPU passthrough."

    # Detected-hardware state: no GPU, no NPU.
    GPU_NAME="None (CPU fallback)"
    GPU_BACKEND="cpu"
    BACKEND_ID="cpu"
    GPU_DEVICE_ID=""
    GPU_MEMORY_TYPE="none"
    GPU_COUNT=0
    GPU_VRAM=0
    HAS_NPU=false

    if [[ "${DREAM_MODE:-local}" == "lemonade" ]]; then
        DREAM_MODE="local"
    fi

    # Capability-profile state, mirrored to the same CPU-only values.
    CAP_GPU_NAME="$GPU_NAME"
    CAP_GPU_VENDOR="cpu"
    CAP_LLM_BACKEND="cpu"
    CAP_GPU_MEMORY_TYPE="none"
    CAP_GPU_COUNT=0
    CAP_GPU_VRAM_MB=0
    CAP_COMPOSE_OVERLAYS=""
    CAP_RECOMMENDED_TIER=""
}
257+
258+
# Choose the CPU-fallback tier from the amount of system RAM.
#
# Arguments: $1 - RAM in whole GB; anything that is not a plain
#                 non-negative integer is treated as 0
# Outputs:   tier on stdout: "3" for >= 96 GB, "2" for >= 48 GB,
#            "0" for < 12 GB, "1" otherwise
# Returns:   0
select_cpu_fallback_tier() {
    local ram_gb="${1:-0}"
    if ! [[ "$ram_gb" =~ ^[0-9]+$ ]]; then
        ram_gb=0
    fi
    # Force base 10: a leading zero ("08") would otherwise be parsed as an
    # invalid octal literal and make every comparison below fail.
    ram_gb=$((10#$ram_gb))

    if (( ram_gb >= 96 )); then
        echo "3"
    elif (( ram_gb >= 48 )); then
        echo "2"
    elif (( ram_gb < 12 )); then
        echo "0"
    else
        echo "1"
    fi
}
274+
149275
detect_gpu() {
150276
GPU_BACKEND="cpu" # default to CPU-only fallback
151277
GPU_MEMORY_TYPE="none"

dream-server/installers/lib/ui.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ check_service() {
265265
local url=$2
266266
local max_attempts=${3:-30}
267267
local timeout=${4:-10} # Timeout per request (default 10s)
268+
local container_name=${5:-}
268269
local spin='⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏'
269270
local i=0
270271
local lore_idx=$(( RANDOM % ${#LORE_MESSAGES[@]} ))
@@ -294,6 +295,24 @@ check_service() {
294295
local curl_exit=$?
295296
elapsed=$((elapsed + backoff))
296297

298+
if [[ -n "$container_name" ]]; then
299+
local docker_cmd="${DOCKER_CMD:-docker}"
300+
local -a docker_cmd_arr=()
301+
read -r -a docker_cmd_arr <<< "$docker_cmd"
302+
[[ ${#docker_cmd_arr[@]} -gt 0 ]] || docker_cmd_arr=(docker)
303+
local container_state=""
304+
if command -v "${docker_cmd_arr[0]}" >/dev/null 2>&1; then
305+
container_state=$("${docker_cmd_arr[@]}" inspect --format '{{.State.Status}}' "$container_name" 2>/dev/null || echo "missing")
306+
case "$container_state" in
307+
exited|dead|missing)
308+
printf "\r ${RED}${NC} %-55s\n" "$name container $container_state"
309+
ai_warn "$name container is $container_state; not retrying health probe."
310+
return 1
311+
;;
312+
esac
313+
fi
314+
fi
315+
297316
# Distinguish between timeout (124), connection refused (7),
298317
# and transient startup errors (56 = recv error, 52 = empty reply)
299318
if [[ $curl_exit -eq 124 ]]; then

dream-server/installers/phases/02-detection.sh

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,15 @@
2727
dream_progress 12 "detection" "Detecting GPU hardware"
2828
chapter "SYSTEM DETECTION"
2929

30+
GPU_BACKEND_REQUESTED="${GPU_BACKEND:-}"
31+
GPU_BACKEND_FORCED=false
32+
[[ "${GPU_BACKEND_REQUESTED,,}" == "amd" ]] && GPU_BACKEND_FORCED=true
33+
GPU_BACKEND_FORCED_CPU=false
34+
[[ "${GPU_BACKEND_REQUESTED,,}" == "cpu" ]] && GPU_BACKEND_FORCED_CPU=true
35+
TIER_REQUESTED="${TIER:-}"
36+
TIER_FORCED=false
37+
[[ -n "$TIER_REQUESTED" ]] && TIER_FORCED=true
38+
3039
# Cloud mode: skip GPU detection entirely
3140
if [[ "${DREAM_MODE:-local}" == "cloud" ]]; then
3241
ai "Cloud mode — skipping GPU detection"
@@ -108,22 +117,43 @@ DISK_AVAIL=$(df -BG "$HOME" | tail -1 | awk '{print $4}' | tr -d 'G')
108117
log "Available disk: ${DISK_AVAIL}GB"
109118

110119
# GPU Detection
111-
ai "Detecting GPU..."
112-
detect_gpu || true
120+
if [[ "$GPU_BACKEND_FORCED_CPU" == "true" ]]; then
121+
ai "GPU_BACKEND=cpu requested - skipping GPU detection"
122+
apply_cpu_gpu_fallback "GPU_BACKEND=cpu was requested."
123+
else
124+
ai "Detecting GPU..."
125+
detect_gpu || true
113126

114-
if [[ "${CAP_PROFILE_LOADED:-false}" == "true" ]]; then
115-
case "${CAP_LLM_BACKEND:-}" in
116-
amd) GPU_BACKEND="amd" ;;
117-
intel) GPU_BACKEND="intel" ;;
118-
cpu) GPU_BACKEND="cpu" ;;
119-
apple) GPU_BACKEND="apple" ;;
120-
*) GPU_BACKEND="nvidia" ;;
121-
esac
122-
[[ -n "${CAP_GPU_MEMORY_TYPE:-}" ]] && GPU_MEMORY_TYPE="${CAP_GPU_MEMORY_TYPE}"
123-
[[ -n "${CAP_GPU_NAME:-}" ]] && GPU_NAME="${CAP_GPU_NAME}"
124-
[[ -n "${CAP_GPU_VRAM_MB:-}" ]] && GPU_VRAM="${CAP_GPU_VRAM_MB}"
125-
[[ -n "${CAP_GPU_COUNT:-}" ]] && GPU_COUNT="${CAP_GPU_COUNT}"
126-
log "Capabilities override detection: backend=${GPU_BACKEND}, memory=${GPU_MEMORY_TYPE}, tier=${CAP_RECOMMENDED_TIER:-unknown}"
127+
if [[ "${CAP_PROFILE_LOADED:-false}" == "true" ]]; then
128+
case "${CAP_LLM_BACKEND:-}" in
129+
amd) GPU_BACKEND="amd" ;;
130+
intel) GPU_BACKEND="intel" ;;
131+
cpu) GPU_BACKEND="cpu" ;;
132+
apple) GPU_BACKEND="apple" ;;
133+
*) GPU_BACKEND="nvidia" ;;
134+
esac
135+
[[ -n "${CAP_GPU_MEMORY_TYPE:-}" ]] && GPU_MEMORY_TYPE="${CAP_GPU_MEMORY_TYPE}"
136+
[[ -n "${CAP_GPU_NAME:-}" ]] && GPU_NAME="${CAP_GPU_NAME}"
137+
[[ -n "${CAP_GPU_VRAM_MB:-}" ]] && GPU_VRAM="${CAP_GPU_VRAM_MB}"
138+
[[ -n "${CAP_GPU_COUNT:-}" ]] && GPU_COUNT="${CAP_GPU_COUNT}"
139+
log "Capabilities override detection: backend=${GPU_BACKEND}, memory=${GPU_MEMORY_TYPE}, tier=${CAP_RECOMMENDED_TIER:-unknown}"
140+
fi
141+
142+
if [[ "$GPU_BACKEND" == "amd" ]] && ! amd_gpu_runtime_devices_available; then
143+
_amd_missing_devices="$(amd_gpu_missing_devices_csv)"
144+
if [[ "${GPU_BACKEND_FORCED:-false}" == "true" ]]; then
145+
ai_bad "GPU_BACKEND=amd was explicitly requested, but required AMD device nodes are missing."
146+
show_amd_gpu_device_guidance "$_amd_missing_devices"
147+
error "Cannot continue with AMD GPU mode until device passthrough is available."
148+
elif ds_in_container; then
149+
ai_warn "AMD hardware was detected, but this container cannot access the AMD GPU devices."
150+
show_amd_gpu_device_guidance "$_amd_missing_devices"
151+
apply_cpu_gpu_fallback "Falling back to CPU mode because AMD GPU passthrough is unavailable in this container."
152+
else
153+
ai_warn "AMD GPU runtime devices not ready yet: ${_amd_missing_devices:-unknown}"
154+
ai "Continuing for now; AMD tuning will try to load kernel modules before services start."
155+
fi
156+
fi
127157
fi
128158

129159
BACKEND_ID="$GPU_BACKEND"
@@ -155,7 +185,7 @@ fi
155185
#-----------------------------------------------------------------------------
156186
# If detect_gpu found no working GPU, check if it's a fixable driver/Secure Boot issue
157187
# (Only for NVIDIA — AMD APU is handled above)
158-
if [[ $GPU_COUNT -eq 0 && "$GPU_BACKEND" != "amd" ]] && ! $DRY_RUN; then
188+
if [[ "${GPU_BACKEND_FORCED_CPU:-false}" != "true" && $GPU_COUNT -eq 0 && "$GPU_BACKEND" != "amd" ]] && ! $DRY_RUN; then
159189
fix_nvidia_secure_boot || true
160190
fi
161191

dream-server/installers/phases/11-services.sh

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,58 @@ if $DRY_RUN; then
2525
else
2626
cd "$INSTALL_DIR" || exit 1
2727

28+
# Set KEY=VALUE in $INSTALL_DIR/.env, replacing every existing KEY= line or
# appending one when the key is absent.
#
# Globals:   INSTALL_DIR (read)
# Arguments: $1 - key, $2 - value (written verbatim)
# Outputs:   none; rewrites the .env file in place
# Returns:   0 on success or when .env does not exist, non-zero when the
#            file could not be rewritten
_phase11_env_set() {
    local key="$1" value="$2" env_file="$INSTALL_DIR/.env" tmp_file
    [[ -f "$env_file" ]] || return 0
    tmp_file="$(mktemp "${env_file}.tmp.XXXXXX")" || return 1

    # Key and value travel through the environment rather than `awk -v`,
    # because -v assignments expand backslash escapes and would turn a
    # literal "\n" or "\t" in the value into control characters.
    # The result is copied back with cat (not mv) so the existing .env keeps
    # its inode, ownership and permissions.
    if DS_ENV_KEY="$key" DS_ENV_VALUE="$value" awk '
        BEGIN { k = ENVIRON["DS_ENV_KEY"]; v = ENVIRON["DS_ENV_VALUE"]; found = 0 }
        index($0, k "=") == 1 { print k "=" v; found = 1; next }
        { print }
        END { if (!found) print k "=" v }
    ' "$env_file" > "$tmp_file" && cat "$tmp_file" > "$env_file"; then
        rm -f -- "$tmp_file"
        return 0
    fi

    # Never leave the scratch file behind on failure.
    rm -f -- "$tmp_file"
    return 1
}
39+
40+
# Late CPU fallback used right before services start: prints guidance,
# flips the in-memory state to CPU, re-derives tier and backend settings,
# and persists the CPU configuration into .env.
#
# Globals read:    TIER_FORCED, RAM_GB, BACKEND_PUBLIC_HEALTH_URL,
#                  BACKEND_PUBLIC_API_PORT, BACKEND_PROVIDER_NAME,
#                  BACKEND_PROVIDER_URL, LLM_MODEL, GGUF_FILE, MAX_CONTEXT,
#                  LLAMA_SERVER_IMAGE
# Globals written: TIER (unless TIER_FORCED is "true"), LLM_HEALTHCHECK_URL,
#                  LLM_PUBLIC_API_PORT, OPENCLAW_PROVIDER_NAME_DEFAULT,
#                  OPENCLAW_PROVIDER_URL_DEFAULT, GPU_BACKEND
# Arguments: $1 - list of missing device nodes, passed to the guidance text
_phase11_apply_cpu_fallback() {
    local missing_nodes="$1"
    local -a env_updates
    local idx

    show_amd_gpu_device_guidance "$missing_nodes"
    apply_cpu_gpu_fallback "Falling back to CPU mode before launching services."

    # Respect a user-chosen tier; otherwise size it from system RAM.
    case "${TIER_FORCED:-false}" in
        true) ;;
        *)
            TIER="$(select_cpu_fallback_tier "${RAM_GB:-0}")"
            log "CPU fallback tier selected: $TIER"
            ;;
    esac

    # A failure to load the CPU contract is tolerated; the defaults below
    # apply whenever the BACKEND_* values are empty or unset.
    load_backend_contract "cpu" || true
    LLM_HEALTHCHECK_URL="${BACKEND_PUBLIC_HEALTH_URL:-http://localhost:8080/health}"
    LLM_PUBLIC_API_PORT="${BACKEND_PUBLIC_API_PORT:-8080}"
    OPENCLAW_PROVIDER_NAME_DEFAULT="${BACKEND_PROVIDER_NAME:-local-llama}"
    OPENCLAW_PROVIDER_URL_DEFAULT="${BACKEND_PROVIDER_URL:-http://llama-server:8080/v1}"
    resolve_tier_config
    # Re-assert the backend after tier resolution has run.
    GPU_BACKEND="cpu"

    # Flat key/value list, written to .env in this order.
    env_updates=(
        GPU_BACKEND "$GPU_BACKEND"
        DREAM_MODE "local"
        LLM_API_URL "http://llama-server:8080"
        LLM_MODEL "$LLM_MODEL"
        GGUF_FILE "$GGUF_FILE"
        MAX_CONTEXT "$MAX_CONTEXT"
        CTX_SIZE "$MAX_CONTEXT"
        AUDIO_STT_MODEL "Systran/faster-whisper-base"
        LLAMA_SERVER_IMAGE "${LLAMA_SERVER_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-b8248}"
    )
    for (( idx = 0; idx < ${#env_updates[@]}; idx += 2 )); do
        _phase11_env_set "${env_updates[idx]}" "${env_updates[idx + 1]}"
    done
    ai_ok "Rewrote .env for CPU fallback"
}
69+
70+
# Last check before services launch: AMD was selected but the device nodes
# are still missing. An explicitly forced AMD backend aborts with guidance;
# otherwise the install continues in CPU mode.
if [[ "${GPU_BACKEND:-}" == "amd" ]]; then
    if ! amd_gpu_runtime_devices_available; then
        _amd_missing_devices="$(amd_gpu_missing_devices_csv)"
        case "${GPU_BACKEND_FORCED:-false}" in
            true)
                ai_bad "GPU_BACKEND=amd was explicitly requested, but required AMD device nodes are missing."
                show_amd_gpu_device_guidance "$_amd_missing_devices"
                exit 1
                ;;
        esac
        _phase11_apply_cpu_fallback "$_amd_missing_devices"
    fi
fi
79+
2880
# Re-resolve compose flags against the actual install directory.
2981
# Phase 03 may have disabled services (e.g., ComfyUI on Tier 0) after
3082
# COMPOSE_FLAGS was first set in Phase 02, making the cached value stale.

0 commit comments

Comments
 (0)