|
27 | 27 | dream_progress 12 "detection" "Detecting GPU hardware" |
28 | 28 | chapter "SYSTEM DETECTION" |
29 | 29 |
|
| 30 | +GPU_BACKEND_REQUESTED="${GPU_BACKEND:-}" |
| 31 | +GPU_BACKEND_FORCED=false |
| 32 | +[[ "${GPU_BACKEND_REQUESTED,,}" == "amd" ]] && GPU_BACKEND_FORCED=true |
| 33 | +GPU_BACKEND_FORCED_CPU=false |
| 34 | +[[ "${GPU_BACKEND_REQUESTED,,}" == "cpu" ]] && GPU_BACKEND_FORCED_CPU=true |
| 35 | +TIER_REQUESTED="${TIER:-}" |
| 36 | +TIER_FORCED=false |
| 37 | +[[ -n "$TIER_REQUESTED" ]] && TIER_FORCED=true |
| 38 | + |
30 | 39 | # Cloud mode: skip GPU detection entirely |
31 | 40 | if [[ "${DREAM_MODE:-local}" == "cloud" ]]; then |
32 | 41 | ai "Cloud mode — skipping GPU detection" |
@@ -108,22 +117,43 @@ DISK_AVAIL=$(df -BG "$HOME" | tail -1 | awk '{print $4}' | tr -d 'G') |
108 | 117 | log "Available disk: ${DISK_AVAIL}GB" |
109 | 118 |
|
110 | 119 | # GPU Detection |
111 | | -ai "Detecting GPU..." |
112 | | -detect_gpu || true |
| 120 | +if [[ "$GPU_BACKEND_FORCED_CPU" == "true" ]]; then |
| 121 | + ai "GPU_BACKEND=cpu requested - skipping GPU detection" |
| 122 | + apply_cpu_gpu_fallback "GPU_BACKEND=cpu was requested." |
| 123 | +else |
| 124 | + ai "Detecting GPU..." |
| 125 | + detect_gpu || true |
113 | 126 |
|
114 | | -if [[ "${CAP_PROFILE_LOADED:-false}" == "true" ]]; then |
115 | | - case "${CAP_LLM_BACKEND:-}" in |
116 | | - amd) GPU_BACKEND="amd" ;; |
117 | | - intel) GPU_BACKEND="intel" ;; |
118 | | - cpu) GPU_BACKEND="cpu" ;; |
119 | | - apple) GPU_BACKEND="apple" ;; |
120 | | - *) GPU_BACKEND="nvidia" ;; |
121 | | - esac |
122 | | - [[ -n "${CAP_GPU_MEMORY_TYPE:-}" ]] && GPU_MEMORY_TYPE="${CAP_GPU_MEMORY_TYPE}" |
123 | | - [[ -n "${CAP_GPU_NAME:-}" ]] && GPU_NAME="${CAP_GPU_NAME}" |
124 | | - [[ -n "${CAP_GPU_VRAM_MB:-}" ]] && GPU_VRAM="${CAP_GPU_VRAM_MB}" |
125 | | - [[ -n "${CAP_GPU_COUNT:-}" ]] && GPU_COUNT="${CAP_GPU_COUNT}" |
126 | | - log "Capabilities override detection: backend=${GPU_BACKEND}, memory=${GPU_MEMORY_TYPE}, tier=${CAP_RECOMMENDED_TIER:-unknown}" |
| 127 | + if [[ "${CAP_PROFILE_LOADED:-false}" == "true" ]]; then |
| 128 | + case "${CAP_LLM_BACKEND:-}" in |
| 129 | + amd) GPU_BACKEND="amd" ;; |
| 130 | + intel) GPU_BACKEND="intel" ;; |
| 131 | + cpu) GPU_BACKEND="cpu" ;; |
| 132 | + apple) GPU_BACKEND="apple" ;; |
| 133 | + *) GPU_BACKEND="nvidia" ;; |
| 134 | + esac |
| 135 | + [[ -n "${CAP_GPU_MEMORY_TYPE:-}" ]] && GPU_MEMORY_TYPE="${CAP_GPU_MEMORY_TYPE}" |
| 136 | + [[ -n "${CAP_GPU_NAME:-}" ]] && GPU_NAME="${CAP_GPU_NAME}" |
| 137 | + [[ -n "${CAP_GPU_VRAM_MB:-}" ]] && GPU_VRAM="${CAP_GPU_VRAM_MB}" |
| 138 | + [[ -n "${CAP_GPU_COUNT:-}" ]] && GPU_COUNT="${CAP_GPU_COUNT}" |
| 139 | + log "Capabilities override detection: backend=${GPU_BACKEND}, memory=${GPU_MEMORY_TYPE}, tier=${CAP_RECOMMENDED_TIER:-unknown}" |
| 140 | + fi |
| 141 | + |
| 142 | + if [[ "$GPU_BACKEND" == "amd" ]] && ! amd_gpu_runtime_devices_available; then |
| 143 | + _amd_missing_devices="$(amd_gpu_missing_devices_csv)" |
| 144 | + if [[ "${GPU_BACKEND_FORCED:-false}" == "true" ]]; then |
| 145 | + ai_bad "GPU_BACKEND=amd was explicitly requested, but required AMD device nodes are missing." |
| 146 | + show_amd_gpu_device_guidance "$_amd_missing_devices" |
| 147 | + error "Cannot continue with AMD GPU mode until device passthrough is available." |
| 148 | + elif ds_in_container; then |
| 149 | + ai_warn "AMD hardware was detected, but this container cannot access the AMD GPU devices." |
| 150 | + show_amd_gpu_device_guidance "$_amd_missing_devices" |
| 151 | + apply_cpu_gpu_fallback "Falling back to CPU mode because AMD GPU passthrough is unavailable in this container." |
| 152 | + else |
| 153 | + ai_warn "AMD GPU runtime devices not ready yet: ${_amd_missing_devices:-unknown}" |
| 154 | + ai "Continuing for now; AMD tuning will try to load kernel modules before services start." |
| 155 | + fi |
| 156 | + fi |
127 | 157 | fi |
128 | 158 |
|
129 | 159 | BACKEND_ID="$GPU_BACKEND" |
|
155 | 185 | #----------------------------------------------------------------------------- |
156 | 186 | # If detect_gpu found no working GPU, check if it's a fixable driver/Secure Boot issue |
157 | 187 | # (Only for NVIDIA — AMD APU is handled above) |
158 | | -if [[ $GPU_COUNT -eq 0 && "$GPU_BACKEND" != "amd" ]] && ! $DRY_RUN; then |
| 188 | +if [[ "${GPU_BACKEND_FORCED_CPU:-false}" != "true" && $GPU_COUNT -eq 0 && "$GPU_BACKEND" != "amd" ]] && ! $DRY_RUN; then |
159 | 189 | fix_nvidia_secure_boot || true |
160 | 190 | fi |
161 | 191 |
|
|
0 commit comments