diff --git a/.github/workflows/p2p-gpu.yml b/.github/workflows/p2p-gpu.yml
new file mode 100644
index 000000000..cab86577c
--- /dev/null
+++ b/.github/workflows/p2p-gpu.yml
@@ -0,0 +1,37 @@
+name: P2P GPU checks
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "dream-server/installers/p2p-gpu/**"
+      - ".github/workflows/p2p-gpu.yml"
+  pull_request:
+    branches: [main]
+    paths:
+      - "dream-server/installers/p2p-gpu/**"
+      - ".github/workflows/p2p-gpu.yml"
+
+permissions:
+  contents: read
+
+jobs:
+  p2p-gpu:
+    name: P2P GPU syntax + regression
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+
+      - name: Bash syntax check (p2p-gpu)
+        run: |
+          shfiles=$(find dream-server/installers/p2p-gpu -name '*.sh' -type f)
+          if [ -z "$shfiles" ]; then
+            echo "No .sh files found under dream-server/installers/p2p-gpu"
+            exit 0
+          fi
+          echo "$shfiles" | xargs bash -n
+
+      - name: NVML mismatch regression
+        run: |
+          # Live Vast.ai + GPU validation is performed manually outside CI.
+          bash dream-server/installers/p2p-gpu/tests/test-nvml-mismatch.sh
diff --git a/dream-server/installers/p2p-gpu/README.md b/dream-server/installers/p2p-gpu/README.md
new file mode 100644
index 000000000..372d5abee
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/README.md
@@ -0,0 +1,196 @@
+# P2P GPU Deploy — DreamServer on Peer-to-Peer GPU Marketplaces
+
+Production-hardened deployment of the full DreamServer AI stack on rented GPU instances from peer-to-peer compute marketplaces (Vast.ai tested; architecture is provider-agnostic).
+
+**One command. All bundled services. Any NVIDIA/AMD GPU or CPU-only instance.**
+
+Automatically handles 28 known P2P GPU environment issues: root user rejection, Docker socket permissions, CPU limit overflow, /tmp permissions, NVIDIA toolkit setup, NVML driver/library mismatch, multi-GPU support, SSH tunneling, package manager locks, and more. Includes built-in recovery commands, health checks, and model auto-swap capabilities.
+
+## What It Solves
+
+**The Problem:** Deploying DreamServer on rented GPU instances is fragile. Root-only environments, non-standard filesystem permissions, held package locks, missing GPU drivers, and provider-specific quirks cause silent failures during setup.
+
+**The Solution:** `setup.sh` is a battle-tested orchestrator that detects and fixes the known issues automatically. It handles permission escalation, creates a non-root `dream` user, manages Docker group access, installs missing NVIDIA/AMD toolkits, applies POSIX ACLs for multi-container file sharing, and starts all bundled services (discovered from extension manifests) with health checks. If setup partially completes, recovery commands bring the stack back online without reinstall.
+
+## Quick Start
+
+```bash
+# On your GPU instance (as root):
+bash setup.sh              # Full install (~10 min)
+bash setup.sh --status     # Health check
+bash setup.sh --info       # Show connection URLs and SSH tunnel commands
+bash setup.sh --teardown   # Stop all services
+```
+
+## Setup Guide
+
+- [Setup Tutorial_Video](https://drive.google.com/file/d/12CY9-KTyCsqRGtyaauqmvsupoh3jocBL/view?usp=sharing)
+- [Setup presentation slides](https://docs.google.com/presentation/d/1XbVNV1n04JiOyAIkA6bU5r5A9T7uBnLr/edit?usp=sharing)
+
+## Quick Recovery (If Phase 9 Fails)
+
+If setup reached "Starting services" but URLs are unreachable:
+
+```bash
+bash setup.sh --fix
+bash setup.sh --status
+bash setup.sh --info
+```
+
+This re-applies CPU caps, permissions, network fixes, restarts compose, and
+prints fresh access commands.
+
+On Windows, use the all-port tunnel from `--info` (it uses a safe local alias
+`58080 -> dashboard` plus direct localhost forwards for service ports).
+
+`--fix` regenerates reconnect scripts:
+- `connect-tunnel.sh` (Linux/macOS/WSL)
+- `connect-tunnel.ps1` (Windows PowerShell)
+
+## What It Does
+
+The setup script handles 28 known issues with P2P GPU environments:
+
+| # | Issue | Fix |
+|---|-------|-----|
+| 01 | Root user rejection | Creates non-root `dream` user |
+| 02 | Docker socket denied | Adds dream to docker group |
+| 03 | /tmp broken | Fixes permissions to 1777 |
+| 04 | CPU limit overflow | Auto-caps to actual core count |
+| 05 | n8n uid mismatch | Dynamic UID from compose.yaml |
+| 06 | dashboard-api write | ACL-based permission system |
+| 07 | comfyui models write | AMD/NVIDIA layout detection |
+| 08 | WEBUI_SECRET missing | Auto-generated secrets |
+| 09 | Dual directory confusion | Smart directory discovery |
+| 10 | Dashboard stuck Created | Auto-nudge on startup |
+| 11 | HuggingFace throttle | aria2c multi-threaded download |
+| 12 | NVIDIA toolkit missing | Auto-installs + configures |
+| 13 | Disk space insufficient | Pre-flight validation |
+| 14 | Compose v1 syntax | Auto-detects v1 vs v2 |
+| 15 | .env duplicates | Idempotent env_set() |
+| 16 | Port conflicts | Dynamic port discovery |
+| 17 | DNS resolution failure | Google/Cloudflare DNS fallback |
+| 18 | /dev/shm too small | Remount /dev/shm to 4GB |
+| 19 | Bootstrap model missing | Auto-downloads Qwen3-0.6B |
+| 20 | llama-server infinite hang | 45s diagnosis + OOM recovery |
+| 21 | No systemd | Host-agent background start |
+| 22 | OpenCode crash-loop | Auto-disable non-essential |
+| 23 | CUDA OOM on large models | Swap to smallest model |
+| 24 | ComfyUI infinite hang | Background download, don't block |
+| 25 | Installer hang | 10min cap on the installer run |
+| 26 | AMD GPU support | ROCm detection + compose overlay |
+| 27 | CPU-only fallback | Works without any GPU |
+| 28 | NVML driver/library mismatch | Detect + targeted repair (regression-tested) |
+
+## Architecture
+
+```
+p2p-gpu/
+├── setup.sh                    # Orchestrator — sources libs, runs phases
+├── config/
+│   └── service-hints.yaml      # p2p-gpu-only manifest overrides (proxy_mode, startup_behavior)
+├── lib/                        # Pure function libraries (no side effects)
+│   ├── constants.sh            # Paths, versions, colors, thresholds
+│   ├── logging.sh              # log/warn/err/step, cleanup trap, flock, dpkg-lock release
+│   ├── environment.sh          # .env management, GPU detection, HTTP polling
+│   ├── permissions.sh          # POSIX ACLs, setgid, UID-specific fixes
+│   ├── services.sh             # Manifest discovery, compose, startup
+│   ├── networking.sh           # Caddy proxy, SSH tunnel, Cloudflare
+│   ├── models.sh               # Model download, URL resolution, swap watcher
+│   ├── gpu-topology.sh         # Per-GPU enumeration, NVLink/PCIe topology, GPU↔service assignment
+│   └── compatibility.sh        # Whisper/TTS/ComfyUI/OpenClaw fixes
+├── phases/                     # Sequential install steps
+│   ├── 00-preflight.sh         # GPU/disk/Docker/DNS validation
+│   ├── 01-dependencies.sh      # System package installation
+│   ├── 02-user-setup.sh        # Create dream user + groups
+│   ├── 03-repository.sh        # Clone DreamServer repo
+│   ├── 04-installer.sh         # Run DreamServer installer (with timeout)
+│   ├── 05-post-install.sh      # Apply fixes, locate working directory
+│   ├── 06-bootstrap-model.sh   # Ensure usable GGUF model exists
+│   ├── 07-model-optimize.sh    # Resume/restart downloads with aria2c
+│   ├── 08-vastai-quirks.sh     # Provider-specific environment fixes
+│   ├── 09-services.sh          # Start containers + health monitoring
+│   ├── 10-voice-stack.sh       # TTS/STT model readiness gates
+│   ├── 11-access-layer.sh      # Caddy proxy + Cloudflare tunnel + SSH
+│   └── 12-summary.sh           # Print access info
+├── subcommands/                # Alternative entry points
+│   ├── teardown.sh             # Stop all services
+│   ├── status.sh               # Health check dashboard
+│   ├── resume.sh               # Quick restart after SSH drop
+│   ├── fix.sh                  # Apply fixes without reinstall
+│   └── info.sh                 # Show connection URLs
+└── tests/
+    └── test-nvml-mismatch.sh   # NVML mismatch repair-path regression (run in CI)
+```
+
+## Design Principles
+
+Aligned with DreamServer's [CLAUDE.md](../../../CLAUDE.md):
+
+- **Let It Crash** — `set -euo pipefail` throughout; errors are fatal unless a failure is explicitly tolerated with `|| warn`. Non-essential services degrade independently, so a working dashboard with a degraded ComfyUI beats a dead stack on an instance you're paying for.
+- **KISS** — readable over clever; one function, one job.
+- **Functional core, imperative shell** — `lib/` holds pure helpers; `phases/` is the imperative shell that runs on source.
+- **Manifest-driven** — services are discovered from extension manifests, never a hardcoded list.
+- **PID-file process tracking** — background jobs (model downloads, swap watcher, tunnels) are tracked by PID file under `/var/run/dreamserver-p2p-gpu/` and stopped by PID.
+- **ACL-primary permissions** — shared-data directories use setgid + POSIX ACLs as their only sharing mechanism. Failures on those paths abort the install (`exit 1`) rather than degrading to world-writable permissions; per-extension ACLs are applied independently so one extension's failure doesn't block the rest.
+
+## Commands
+
+| Command | Purpose |
+|---------|---------|
+| `bash setup.sh` | Full install (first time or re-install) |
+| `bash setup.sh --resume` | Quick restart — re-apply fixes + start services |
+| `bash setup.sh --status` | Health check — GPU, containers, ports |
+| `bash setup.sh --info` | Show connection URLs and SSH tunnel commands |
+| `bash setup.sh --fix` | Apply latest fixes without full reinstall |
+| `bash setup.sh --teardown` | Stop all services |
+| `bash setup.sh --dry-run` | Preview what would happen without making changes |
+
+## Model Download and Auto-Swap
+
+- Setup starts quickly on a small model, downloads the GPU-tier model in background, then auto-swaps when ready.
+- Swap updates both `GGUF_FILE` and `LLM_MODEL`, then restarts dependent services.
+- Dashboard model downloads (`/models` page) require the Dream host agent; setup auto-starts it during service startup.
+
+```bash
+MODEL="Qwen3-30B-A3B-Q4_K_M.gguf"; DS_DIR="${DS_DIR:-/home/dream/dream-server}"; LLM_MODEL="$(echo "$MODEL" | sed -E 's/\.(gguf|GGUF)$//' | sed -E 's/-Q[0-9]+([._][A-Za-z0-9]+)*$//' | tr '[:upper:]' '[:lower:]')"; cd "$DS_DIR" && sed -i "s|^GGUF_FILE=.*|GGUF_FILE=${MODEL}|" .env && { grep -q '^LLM_MODEL=' .env && sed -i "s|^LLM_MODEL=.*|LLM_MODEL=${LLM_MODEL}|" .env || echo "LLM_MODEL=${LLM_MODEL}" >> .env; } && docker compose $(cat .compose-flags 2>/dev/null) up -d llama-server && for c in dream-dreamforge dream-openclaw dream-dashboard-api dream-webui; do docker ps --format '{{.Names}}' | grep -qx "$c" && docker restart "$c" >/dev/null || echo "[warn] ${c} restart failed (non-fatal)" >&2; done
+```
+
+```bash
+tail -f /home/dream/dream-server/logs/aria2c-download.log
+```
+
+```bash
+# If Dashboard shows "Failed to start download"
+su - dream -c 'cd /home/dream/dream-server && DREAM_HOME=/home/dream/dream-server ./dream-cli agent start'
+```
+
+## Provider Support
+
+Currently tested on **Vast.ai**. The architecture is provider-agnostic:
+- GPU detection works for any NVIDIA/AMD/CPU-only instance
+- Docker + compose requirements are standard
+- Provider-specific quirks isolated in `phases/08-vastai-quirks.sh`
+
+The active provider is selected by `PROVIDER_NAME` (override with `P2P_GPU_PROVIDER`
+before running). To add a new provider, create `phases/08-<provider>-quirks.sh` with
+provider-specific fixes.
+
+## Security
+
+- `.env` files created with `0660` mode, owned `dream:dream` — readable by the `dream` group the containers run under, never world-readable
+- SSH private keys forced to `0600`
+- Background process PIDs tracked in `/var/run/dreamserver-p2p-gpu/`
+- Cloudflare tokens passed via environment variables (not CLI args)
+- `cloudflared` binary verified against the upstream SHA256 when the checksum file is reachable; on mismatch the tunnel is skipped
+- POSIX ACLs required; world-writable permissions are never used
+- Multi-UID directories documented with reasons for broader access
+
+## Related
+
+- [`../../../README.md`](../../../README.md) — DreamServer project overview
+- [`../../../CLAUDE.md`](../../../CLAUDE.md) — design philosophy and error-handling rules
+- [`../../docs/INSTALLER-ARCHITECTURE.md`](../../docs/INSTALLER-ARCHITECTURE.md) — installer module map and header convention
+- [`../../docs/EXTENSIONS.md`](../../docs/EXTENSIONS.md) — service/extension manifest model
+- [`../../CONTRIBUTING.md`](../../CONTRIBUTING.md) — contribution and validation guide
+- [`../../../SECURITY.md`](../../../SECURITY.md) — security policy and disclosure
\ No newline at end of file
diff --git a/dream-server/installers/p2p-gpu/config/service-hints.yaml b/dream-server/installers/p2p-gpu/config/service-hints.yaml
new file mode 100644
index 000000000..c68b3c49f
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/config/service-hints.yaml
@@ -0,0 +1,23 @@
+# P2P GPU deployment hints — service-specific overrides for the setup script.
+# These supplement manifest.yaml defaults ONLY within the p2p-gpu context.
+# When upstream adopts proxy_mode/startup_behavior as first-class manifest
+# fields, delete this file and remove the hints merge in lib/services.sh.
+
+comfyui:
+  proxy_mode: root
+  startup_behavior: heavy
+
+dashboard:
+  proxy_mode: root
+
+open-webui:
+  proxy_mode: root
+
+perplexica:
+  startup_behavior: heavy
+
+tts:
+  startup_behavior: heavy
+
+whisper:
+  startup_behavior: heavy
diff --git a/dream-server/installers/p2p-gpu/lib/compatibility.sh b/dream-server/installers/p2p-gpu/lib/compatibility.sh
new file mode 100644
index 000000000..c0838c7f3
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/compatibility.sh
@@ -0,0 +1,391 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Compatibility Fixes
+# ============================================================================
+# Part of: p2p-gpu/lib/
+# Purpose: Service-specific compatibility patches for Whisper, TTS, ComfyUI,
+#          and OpenClaw running on Vast.ai instances
+#
+# Expects: LOGFILE, log(), warn(), env_get(), wait_for_http()
+# Provides: ensure_whisper_ui_compatibility(), ensure_webui_stt_model_alignment(),
+#           map_whisper_model_id(), ensure_whisper_asr_model(), ensure_tts_model_ready(),
+#           fix_comfyui_permissions(), comfyui_preload_models(),
+#           patch_openclaw_inject_token_runtime()
+#
+# Modder notes:
+#   These are narrow fixes for known Vast.ai failure modes. Each function
+#   is idempotent and safe to re-run.
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+# Fix Whisper UI internal API connectivity + entrypoint executable bit
+ensure_whisper_ui_compatibility() {
+  local ds_dir="$1"
+  local whisper_compose="${ds_dir}/extensions/services/whisper/compose.yaml"
+  local whisper_entrypoint="${ds_dir}/extensions/services/whisper/docker-entrypoint.sh"
+
+  if [[ -f "$whisper_entrypoint" ]]; then
+    # [NON-FATAL: whisper] Entry point permissions only affect Whisper UI.
+    chmod 755 "$whisper_entrypoint" || warn "whisper entrypoint chmod failed (non-fatal)"
+  fi
+
+  [[ ! -f "$whisper_compose" ]] && return 0
+
+  if ! grep -q 'LOOPBACK_HOST_URL=' "$whisper_compose"; then
+    if grep -q 'WHISPER__TTL=' "$whisper_compose"; then
+      sed -i '/WHISPER__TTL=/a\      - LOOPBACK_HOST_URL=http://127.0.0.1:8000\n      - CHAT_COMPLETION_BASE_URL=http://llama-server:8080/v1\n      - CHAT_COMPLETION_API_KEY=cant-be-empty' \
+        "$whisper_compose"
+      log "Injected Whisper UI loopback compatibility env"
+    else
+      warn "Whisper compose env block not found — skipped loopback injection"
+    fi
+  fi
+}
+
+# Keep Open WebUI STT model aligned with the Whisper model we bootstrap.
+# Fixes mismatch where WebUI requests a model that Whisper does not have.
+ensure_webui_stt_model_alignment() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+  local nvidia_overlay="${ds_dir}/docker-compose.nvidia.yml"
+  [[ ! -f "$nvidia_overlay" ]] && return 0
+
+  local whisper_cfg model_id current
+  whisper_cfg="$(env_get "$env_file" "WHISPER_MODEL")"
+  model_id="$(map_whisper_model_id "$whisper_cfg")"
+  [[ -z "$model_id" ]] && model_id="Systran/faster-whisper-base"
+
+  current=$(grep -E 'AUDIO_STT_MODEL:' "$nvidia_overlay" | head -1 | sed -E 's/.*AUDIO_STT_MODEL:\s*"?(.*)"?/\1/' || echo "")
+  [[ "$current" == "$model_id" ]] && return 0
+
+  # Preserve existing indentation to avoid corrupting YAML structure.
+  sed -i -E "s|^([[:space:]]*)AUDIO_STT_MODEL:.*|\1AUDIO_STT_MODEL: \"${model_id}\"|" "$nvidia_overlay"
+  log "Aligned Open WebUI STT model to ${model_id}"
+}
+
+# Map friendly WHISPER_MODEL values to Speaches-compatible model IDs
+map_whisper_model_id() {
+  local raw="$1"
+  case "${raw,,}" in
+    tiny|tiny.en)                echo "Systran/faster-whisper-tiny" ;;
+    base|base.en|"")             echo "Systran/faster-whisper-base" ;;
+    small|small.en)              echo "Systran/faster-whisper-small" ;;
+    medium|medium.en)            echo "Systran/faster-whisper-medium" ;;
+    large|large-v2|large-v3)     echo "Systran/faster-whisper-large-v3" ;;
+    turbo|large-v3-turbo)        echo "deepdml/faster-whisper-large-v3-turbo-ct2" ;;
+    */*)                         echo "$raw" ;;
+    *)                           echo "Systran/faster-whisper-base" ;;
+  esac
+}
+
+# Ensure at least one ASR model is loaded in Whisper
+ensure_whisper_asr_model() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+  local whisper_port
+  whisper_port="$(env_get "$env_file" "WHISPER_PORT")"
+  whisper_port="${whisper_port:-9000}"
+
+  if ! wait_for_http "http://127.0.0.1:${whisper_port}/health" 120 4; then
+    warn "Whisper not reachable on port ${whisper_port} — skipping ASR bootstrap"
+    return 0
+  fi
+
+  local asr_count
+  asr_count=$(curl -sf --max-time 12 \
+    "http://127.0.0.1:${whisper_port}/v1/models?task=automatic-speech-recognition" \
+    | jq -r '.data | length' || echo 0)
+
+  if [[ "$asr_count" =~ ^[0-9]+$ ]] && [[ "$asr_count" -gt 0 ]]; then
+    log "Whisper ASR models already available (${asr_count})"
+    return 0
+  fi
+
+  local whisper_cfg model_id encoded_model
+  whisper_cfg="$(env_get "$env_file" "WHISPER_MODEL")"
+  model_id="$(map_whisper_model_id "$whisper_cfg")"
+  encoded_model="${model_id//\//%2F}"
+
+  warn "No ASR models — bootstrapping ${model_id}"
+  curl -sf -X POST --max-time 30 \
+    "http://127.0.0.1:${whisper_port}/v1/models/${encoded_model}" > /dev/null \
+    || { warn "Could not trigger Whisper model download for ${model_id}"; return 0; }
+
+  _wait_for_asr "$whisper_port"
+}
+
+_wait_for_asr() {
+  local whisper_port="$1"
+  local waited=0
+  while [[ $waited -lt 180 ]]; do
+    local asr_count
+    asr_count=$(curl -sf --max-time 12 \
+      "http://127.0.0.1:${whisper_port}/v1/models?task=automatic-speech-recognition" \
+      | jq -r '.data | length' || echo 0)
+    if [[ "$asr_count" =~ ^[0-9]+$ ]] && [[ "$asr_count" -gt 0 ]]; then
+      log "Whisper ASR model bootstrap complete (${asr_count} model(s))"
+      return 0
+    fi
+    sleep 6
+    waited=$((waited + 6))
+  done
+  warn "Whisper model download started but not ready — will appear shortly"
+}
+
+# Wait for Kokoro TTS to load at least one voice model
+ensure_tts_model_ready() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+  local tts_port
+  tts_port="$(env_get "$env_file" "TTS_PORT")"
+  tts_port="${tts_port:-8880}"
+
+  if ! docker ps --format '{{.Names}}' | grep -q 'dream-tts'; then
+    return 0
+  fi
+
+  if ! wait_for_http "http://127.0.0.1:${tts_port}/health" 90 4; then
+    warn "Kokoro TTS not reachable on port ${tts_port} — skipping"
+    return 0
+  fi
+
+  local voice_count
+  voice_count=$(curl -sf --max-time 10 "http://127.0.0.1:${tts_port}/v1/audio/voices" \
+    | jq -r 'if type == "array" then length elif .voices then (.voices | length) else 0 end' \
+    || echo 0)
+
+  if [[ "$voice_count" =~ ^[0-9]+$ ]] && [[ "$voice_count" -gt 0 ]]; then
+    log "Kokoro TTS ready (${voice_count} voice(s))"
+    return 0
+  fi
+
+  warn "Kokoro TTS starting — waiting for voice model..."
+  _wait_for_tts "$tts_port"
+}
+
+_wait_for_tts() {
+  local tts_port="$1"
+  local waited=0
+  while [[ $waited -lt 90 ]]; do
+    local voice_count
+    voice_count=$(curl -sf --max-time 10 "http://127.0.0.1:${tts_port}/v1/models" \
+      | jq -r '.data | length' || echo 0)
+    if [[ "$voice_count" =~ ^[0-9]+$ ]] && [[ "$voice_count" -gt 0 ]]; then
+      log "Kokoro TTS model loaded (${voice_count} model(s))"
+      return 0
+    fi
+    sleep 6
+    waited=$((waited + 6))
+  done
+  warn "Kokoro TTS model still loading — will be available shortly"
+}
+
+# Fix ComfyUI permissions for AMD vs NVIDIA mount layouts
+fix_comfyui_permissions() {
+  local data_dir="$1"
+  local gpu_backend="${2:-nvidia}"
+
+  local dirs
+  if [[ "$gpu_backend" == "amd" ]]; then
+    dirs=("${data_dir}/comfyui/ComfyUI/models"
+          "${data_dir}/comfyui/ComfyUI/output"
+          "${data_dir}/comfyui/ComfyUI/input"
+          "${data_dir}/comfyui/ComfyUI/custom_nodes")
+  else
+    dirs=("${data_dir}/comfyui/models"
+          "${data_dir}/comfyui/output"
+          "${data_dir}/comfyui/input"
+          "${data_dir}/comfyui/workflows")
+  fi
+
+  for d in "${dirs[@]}"; do
+    mkdir -p "$d" || { warn "comfyui mkdir failed on ${d} (non-fatal)"; continue; }
+    # [NON-FATAL: comfyui] ComfyUI will fail its own healthcheck if ACLs remain broken.
+    chmod 2775 "$d" && setfacl -R -d -m "u::rwx,u:$(id -u comfyui 2>>"$LOGFILE" || echo 1000):rwx,g::rwx,o::rx" "$d" \
+      || warn "comfyui ACL failed on ${d} (non-fatal)"
+  done
+}
+
+# Download user-specified ComfyUI models from COMFYUI_EXTRA_MODELS env var
+comfyui_preload_models() {
+  local ds_dir="$1"
+  local gpu_backend="${2:-nvidia}"
+  local env_file="${ds_dir}/.env"
+  local data_dir="${ds_dir}/data"
+
+  local extra_models
+  extra_models="$(env_get "$env_file" "COMFYUI_EXTRA_MODELS")"
+  [[ -z "$extra_models" ]] && return 0
+
+  local models_root
+  if [[ "$gpu_backend" == "amd" ]]; then
+    models_root="${data_dir}/comfyui/ComfyUI/models"
+  else
+    models_root="${data_dir}/comfyui/models"
+  fi
+  mkdir -p "$models_root"
+
+  log "Processing ComfyUI extra models..."
+  echo "$extra_models" | tr ';' '\n' | while IFS='|' read -r url target; do
+    url=$(echo "$url" | xargs)
+    target=$(echo "$target" | xargs)
+    [[ -z "$url" || -z "$target" ]] && continue
+    _download_comfyui_model "$models_root" "$url" "$target"
+  done
+
+  apply_data_acl "$models_root"
+  log "ComfyUI model preload complete"
+}
+
+_download_comfyui_model() {
+  local models_root="$1" url="$2" target="$3"
+  local dest="${models_root}/${target}"
+  local dest_dir
+  dest_dir="$(dirname "$dest")"
+  mkdir -p "$dest_dir"
+
+  [[ -f "$dest" ]] && { log "  Already exists: ${target}"; return 0; }
+
+  log "  Downloading: ${target}..."
+  if command -v aria2c &>/dev/null; then
+    # [NON-FATAL: comfyui] Optional extra model download failures should not block install.
+    aria2c -x 4 -s 4 -k 5M --file-allocation=none --console-log-level=warn \
+      -d "$dest_dir" -o "$(basename "$dest")" "$url" 2>&1 | tail -3 \
+      || warn "  Failed to download ${target} (non-fatal)"
+  else
+    # [NON-FATAL: comfyui] Optional extra model download failures should not block install.
+    curl -L --progress-bar -o "$dest" "$url" \
+      || warn "  Failed to download ${target} (non-fatal)"
+  fi
+}
+
+# Patch OpenClaw's inject-token.js for model reference compatibility
+patch_openclaw_inject_token_runtime() {
+  local ds_dir="$1"
+  local target="${ds_dir}/config/openclaw/inject-token.js"
+
+  [[ ! -f "$target" ]] && return 0
+  if ! command -v perl &>/dev/null; then
+    warn "perl missing — cannot patch OpenClaw injector"
+    return 0
+  fi
+
+  # Already patched? Keep idempotent.
+  if grep -q "const providerMap = config.models?.providers || config.providers || null;" "$target" \
+    && grep -q "firstModel.name = LLM_MODEL;" "$target" \
+    && grep -q "updated legacy agent model refs ->" "$target"; then
+    log "OpenClaw injector patch already present: ${target}"
+    return 0
+  fi
+
+  local before_hash
+  before_hash=$(sha256sum "$target" | awk '{print $1}' || echo "")
+
+  local subs
+  subs=$(perl -0777 -i - "$target" <<'PERL'
+my $replacement = <<'JS';
+  // Fix model references to match what llama-server actually serves
+  if (LLM_MODEL) {
+    const providerMap = config.models?.providers || config.providers || null;
+    const providerName = providerMap ? Object.keys(providerMap)[0] : null;
+
+    if (providerName && providerMap[providerName]) {
+      const provider = providerMap[providerName];
+      const ollamaUrl = process.env.OLLAMA_URL || '';
+      const litellmKey = process.env.LITELLM_KEY || '';
+      if (ollamaUrl) {
+        const newBase = ollamaUrl.replace(/\/$/, '') + '/v1';
+        if (provider.baseUrl !== newBase) {
+          console.log(`[inject-token] updated provider baseUrl: ${provider.baseUrl} -> ${newBase}`);
+          provider.baseUrl = newBase;
+        }
+        if (litellmKey && provider.apiKey !== litellmKey) {
+          provider.apiKey = litellmKey;
+          console.log(`[inject-token] updated provider apiKey from env`);
+        }
+      }
+
+      if (Array.isArray(provider.models) && provider.models.length > 0) {
+        const firstModel = provider.models[0];
+        if (firstModel && typeof firstModel === 'object') {
+          const oldValue = firstModel.name || firstModel.id || '<unset>';
+          if (firstModel.name !== LLM_MODEL || firstModel.id !== LLM_MODEL) {
+            firstModel.name = LLM_MODEL;
+            firstModel.id = LLM_MODEL;
+            console.log(`[inject-token] updated provider model: ${oldValue} -> ${LLM_MODEL}`);
+          }
+        }
+      }
+    }
+
+    if (config.agents?.defaults) {
+      const d = config.agents.defaults;
+      const fullOld = d.model?.primary || '';
+      if (fullOld && providerName) {
+        const fullNew = `${providerName}/${LLM_MODEL}`;
+        if (fullOld !== fullNew) {
+          d.model = { primary: fullNew };
+          d.models = { [fullNew]: {} };
+          if (d.subagents) d.subagents.model = fullNew;
+          console.log(`[inject-token] updated agent model refs: ${fullOld} -> ${fullNew}`);
+        }
+      }
+    }
+
+    if (config.agent && providerName) {
+      const fullNew = `${providerName}/${LLM_MODEL}`;
+      if (config.agent.model !== fullNew) {
+        config.agent.model = fullNew;
+        if (config.subagent) config.subagent.model = fullNew;
+        console.log(`[inject-token] updated legacy agent model refs -> ${fullNew}`);
+      }
+    }
+  }
+
+  // Override LLM baseUrl for Token Spy monitoring (if OPENCLAW_LLM_URL is set)
+JS
+
+my $n = s{
+\Q  // Fix model references to match what llama-server actually serves
+  if (LLM_MODEL) {\E
+.*?
+\Q  }
+
+  // Override LLM baseUrl for Token Spy monitoring (if OPENCLAW_LLM_URL is set)\E
+}{$replacement}sx;
+
+print $n;
+PERL
+)
+
+  _verify_openclaw_patch "$target" "$before_hash" "${subs:-0}"
+}
+
+_verify_openclaw_patch() {
+  local target="$1" before_hash="$2" subs="$3"
+
+  if [[ "$subs" -eq 0 ]]; then
+    if grep -q "const providerMap = config.models?.providers || config.providers || null;" "$target"; then
+      log "OpenClaw injector patch already present: ${target}"
+    else
+      warn "OpenClaw injector patch pattern not found in ${target} — leaving unchanged"
+    fi
+    return 0
+  fi
+
+  if grep -q "const providerMap = config.models?.providers || config.providers || null;" "$target" \
+    && grep -q "firstModel.name = LLM_MODEL;" "$target"; then
+    local after_hash
+    after_hash=$(sha256sum "$target" | awk '{print $1}' || echo "")
+    if [[ "$before_hash" != "$after_hash" ]]; then
+      log "Patched OpenClaw injector: ${target}"
+    else
+      log "OpenClaw injector patch already present: ${target}"
+    fi
+  else
+    warn "OpenClaw injector patch could not be verified: ${target}"
+  fi
+}
diff --git a/dream-server/installers/p2p-gpu/lib/constants.sh b/dream-server/installers/p2p-gpu/lib/constants.sh
new file mode 100644
index 000000000..18471cd51
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/constants.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Deploy Constants
+# ============================================================================
+# Part of: dream-server/installers/p2p-gpu/lib/
+# Purpose: Readonly variables, colors, paths, thresholds
+#
+# Expects: (nothing — first file sourced)
+# Provides: P2P_GPU_VERSION, PROVIDER_NAME, DREAM_USER, DREAM_HOME,
+#           REPO_URL, REPO_BRANCH, MIN_DISK_GB, MIN_VRAM_MB,
+#           LOCKFILE, LOGFILE, PIDFILE_DIR, color codes
+#
+# Modder notes:
+#   All constants are readonly. Override via env vars BEFORE sourcing.
+#   Variables are consumed by other files sourced after this one.
+#   To add a new provider: create providers/<name>.sh, set PROVIDER_NAME.
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+# shellcheck disable=SC2034  # Variables used by sourcing scripts
+set -euo pipefail
+
+readonly P2P_GPU_VERSION="6.1.0"
+# Back-compat alias for phases that reference the old name
+readonly VASTAI_VERSION="$P2P_GPU_VERSION"
+readonly PROVIDER_NAME="${P2P_GPU_PROVIDER:-vastai}"
+readonly LOCKFILE="/tmp/dreamserver-p2p-gpu-setup.lock"
+readonly LOGFILE="/var/log/dreamserver-p2p-gpu-setup.log"
+readonly PIDFILE_DIR="/var/run/dreamserver-p2p-gpu"
+
+readonly DREAM_USER="dream"
+readonly DREAM_HOME="/home/${DREAM_USER}"
+readonly REPO_URL="https://github.com/Light-Heart-Labs/DreamServer.git"
+readonly REPO_BRANCH="main"
+readonly MIN_DISK_GB=40
+readonly MIN_VRAM_MB=8000
+readonly INSTALLER_TIMEOUT="${INSTALLER_TIMEOUT:-600}"
+readonly MULTIGPU_MIN_GPUS=2
+
+# ── Colors ──────────────────────────────────────────────────────────────────
+readonly RED='\033[0;31m'
+readonly GREEN='\033[0;32m'
+readonly YELLOW='\033[1;33m'
+readonly CYAN='\033[0;36m'
+readonly BOLD='\033[1m'
+readonly DIM='\033[2m'
+readonly NC='\033[0m'
diff --git a/dream-server/installers/p2p-gpu/lib/environment.sh b/dream-server/installers/p2p-gpu/lib/environment.sh
new file mode 100644
index 000000000..7b98516c7
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/environment.sh
@@ -0,0 +1,908 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Environment Helpers
+# ============================================================================
+# Part of: dream-server/installers/p2p-gpu/lib/
+# Purpose: .env management, port checks, directory discovery, CPU capping,
+#          ownership fixes, HTTP polling, GPU detection, post-install orchestrator
+#
+# Expects: DREAM_USER, DREAM_HOME, LOGFILE, log(), warn(), err()
+# Provides: env_set(), env_get(), port_in_use(), find_dream_dir(),
+#           ensure_dream_cli_command(),
+#           cap_cpu_in_yaml(), cap_cpu_in_files(), get_compose_cpu_ceiling(),
+#           compute_safe_cpu_cap(), fix_ownership(), wait_for_http(),
+#           detect_gpu(), _cap_context_for_vram(), apply_post_install_fixes()
+#
+# Modder notes:
+#   env_set is idempotent — safe to call multiple times with same key.
+#   env_set creates .env with 0660 mode to protect secrets and allow dream user access.
+#   find_dream_dir checks both expected DreamServer install paths.
+#   detect_gpu() is the single source of truth for GPU detection —
+#   call it once and reuse the result (avoid duplicate detection).
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+# ── [FIX: env-perms] .env management with proper file permissions ───────────
+
+# Set a key in .env idempotently (no duplicates, preserves inode)
+# Creates with 0660 to protect secrets (WEBUI_SECRET, API keys, etc.) and allow dream user
+env_set() {
+  local file="$1" key="$2" value="$3"
+  if [[ ! -f "$file" ]]; then
+    install -m 0660 -o "${DREAM_USER:-root}" -g "${DREAM_USER:-root}" /dev/null "$file"
+  fi
+  if grep -q "^${key}=" "$file"; then
+    # Escape sed delimiter in value to prevent breakage
+    local escaped_value="${value//|/\\|}"
+    sed -i "s|^${key}=.*|${key}=${escaped_value}|" "$file"
+  else
+    echo "${key}=${value}" >> "$file"
+  fi
+}
+
+# Read a key from .env
+env_get() {
+  local file="$1" key="$2"
+  [[ ! -f "$file" ]] && return 0
+  grep "^${key}=" "$file" 2>>"$LOGFILE" | head -1 | cut -d= -f2- \
+    | sed 's/[[:space:]]#.*$//' | tr -d '"' | tr -d "'" || echo ""
+}
+
+# Check if a TCP port is in use
+port_in_use() {
+  local port="$1"
+  ss -tlnp 2>&1 | grep -q ":${port} "
+}
+
+# Locate the active dream-server working directory
+find_dream_dir() {
+  local candidate
+  # Prefer directory with both .env and compose (fully configured)
+  for candidate in "${DREAM_HOME}/dream-server" "${DREAM_HOME}/DreamServer/dream-server"; do
+    if [[ -f "${candidate}/.env" && -f "${candidate}/docker-compose.base.yml" ]]; then
+      echo "$candidate"
+      return 0
+    fi
+  done
+  # Fallback: any existing directory (partially configured)
+  for candidate in "${DREAM_HOME}/dream-server" "${DREAM_HOME}/DreamServer/dream-server"; do
+    if [[ -d "$candidate" ]]; then
+      echo "$candidate"
+      return 0
+    fi
+  done
+  return 1
+}
+
+# Install a stable `dream` command wrapper for root/non-root shells.
+ensure_dream_cli_command() {
+  local ds_dir="$1"
+  local cli_path="${ds_dir}/dream-cli"
+  local wrapper="/usr/local/bin/dream"
+
+  if [[ ! -x "$cli_path" ]]; then
+    warn "dream-cli not executable at ${cli_path} (skipping global dream command)"
+    return 0
+  fi
+
+  cat > "$wrapper" << EOF
+#!/usr/bin/env bash
+set -euo pipefail
+export DREAM_HOME="\${DREAM_HOME:-${ds_dir}}"
+cd "${ds_dir}" || exit 1
+exec "${cli_path}" "\$@"
+EOF
+  # [NON-FATAL: convenience] Missing wrapper only affects global dream alias.
+  chmod +x "$wrapper" || warn "chmod failed on ${wrapper} (non-fatal)"
+  log "Installed global dream command: ${wrapper}"
+}
+
+# Cap CPU values in one YAML file to max_cpu.
+# Handles any numeric form (N, N.M) with optional quotes. Values <= max_cpu
+# are left alone; values > max_cpu are lowered to max_cpu.
+_cap_cpu_in_yaml_file() {
+  local file="$1" max_cpu="$2"
+  [[ ! -f "$file" ]] && return 0
+  python3 - "$file" "$max_cpu" <<'PY'
+import re, sys
+path, cap = sys.argv[1], float(sys.argv[2])
+try:
+  with open(path, "r", encoding="utf-8") as fh:
+    src = fh.read()
+except OSError:
+  sys.exit(0)
+
+def parse_numeric(value):
+  raw = value.strip().strip("'\"")
+  if re.fullmatch(r"[0-9]+(?:\.[0-9]+)?", raw):
+    return float(raw)
+  m = re.fullmatch(r"\$\{[^:}]+:-([0-9]+(?:\.[0-9]+)?)\}", raw)
+  if m:
+    return float(m.group(1))
+  return None
+
+def repl(m):
+  indent, rhs, comment = m.group(1), m.group(2).strip(), m.group(3) or ""
+  q = "'"
+  if rhs[:1] in ("'", '"'):
+    q = rhs[0]
+
+  numeric = parse_numeric(rhs)
+  needs_cap = ("${" in rhs) or (numeric is None) or (numeric > cap)
+  if needs_cap:
+    return f"{indent}cpus: {q}{cap:g}{q}{comment}"
+  return m.group(0)
+
+pat = re.compile(r"^(\s*)cpus:\s*([^#\n]+?)(\s+#.*)?$", re.M)
+new = pat.sub(repl, src)
+if new != src:
+  with open(path, "w", encoding="utf-8") as fh:
+    fh.write(new)
+PY
+}
+
+# Cap CPU values in all YAML files under a directory tree.
+cap_cpu_in_yaml() {
+  local dir="$1" max_cpu="$2"
+  while IFS= read -r -d '' f; do
+    _cap_cpu_in_yaml_file "$f" "$max_cpu"
+  done < <(find "$dir" \( -name "*.yml" -o -name "*.yaml" \) -type f -print0)
+  return 0
+}
+
+# Cap CPU values in a specific list of YAML files.
+cap_cpu_in_files() {
+  local max_cpu="$1"
+  shift
+  local f
+  for f in "$@"; do
+    _cap_cpu_in_yaml_file "$f" "$max_cpu"
+  done
+  return 0
+}
+
+# Return the CPU ceiling Docker can actually schedule, accounting for
+# container-level CPU quotas that can differ from nproc.
+get_compose_cpu_ceiling() {
+  local host_nproc docker_ncpu ceiling
+
+  host_nproc=$(nproc 2>>"$LOGFILE" || echo 1)
+  if [[ ! "$host_nproc" =~ ^[0-9]+$ ]] || [[ "$host_nproc" -lt 1 ]]; then
+    host_nproc=1
+  fi
+
+  ceiling="$host_nproc"
+  docker_ncpu=$(docker info --format '{{.NCPU}}' 2>>"$LOGFILE" || echo "")
+  if [[ "$docker_ncpu" =~ ^[0-9]+$ ]] && [[ "$docker_ncpu" -gt 0 ]] && [[ "$docker_ncpu" -lt "$ceiling" ]]; then
+    ceiling="$docker_ncpu"
+  fi
+
+  echo "$ceiling"
+}
+
+# Compute a safe cpus: cap value with one-core headroom.
+# Optional arg 1: hard ceiling discovered from daemon error output.
+compute_safe_cpu_cap() {
+  local forced_ceiling="${1:-}"
+  local ceiling
+
+  ceiling=$(get_compose_cpu_ceiling)
+  if [[ "$forced_ceiling" =~ ^[0-9]+$ ]] && [[ "$forced_ceiling" -gt 0 ]] && [[ "$forced_ceiling" -lt "$ceiling" ]]; then
+    ceiling="$forced_ceiling"
+  fi
+
+  if [[ "$ceiling" -gt 1 ]]; then
+    echo $((ceiling - 1))
+  else
+    echo 1
+  fi
+}
+
+# Fix ownership recursively (unconditional to catch nested root-owned files)
+fix_ownership() {
+  local dir="$1" user="$2" group="${3:-$2}"
+  [[ ! -d "$dir" ]] && return 0
+  # Always apply chown recursively to fix root-owned files inside target-owned directories
+  # chown may fail on NFS mounts or in containers without CAP_CHOWN
+  if ! chown -R "${user}:${group}" "$dir" 2>>"$LOGFILE"; then
+    warn "chown failed on ${dir} (non-fatal; host may block ownership changes)"
+  fi
+}
+
+# Wait for a URL to return HTTP 200
+wait_for_http() {
+  local url="$1" timeout="${2:-60}" interval="${3:-5}"
+  local elapsed=0
+  while [[ $elapsed -lt $timeout ]]; do
+    if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
+      return 0
+    fi
+    sleep "$interval"
+    elapsed=$((elapsed + interval))
+  done
+  return 1
+}
+
+# ── [FIX: gpu-dedup] Single source of truth for GPU detection ───────────────
+# Sets GPU_BACKEND, GPU_NAME, GPU_VRAM, GPU_COUNT as globals.
+# Call once in preflight; all other code reads these variables.
+detect_gpu() {
+  GPU_BACKEND="cpu"
+  GPU_NAME="none"
+  GPU_VRAM="0"
+  GPU_COUNT=0
+  GPU_TOTAL_VRAM=0
+
+  if command -v nvidia-smi &>/dev/null && nvidia-smi --query-gpu=name --format=csv,noheader &>/dev/null 2>&1; then
+    GPU_BACKEND="nvidia"
+    GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>>"$LOGFILE" | sed -n '1p' | xargs)
+    GPU_VRAM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>>"$LOGFILE" | sed -n '1p' | xargs)
+    GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>>"$LOGFILE" | wc -l)
+    GPU_TOTAL_VRAM=0
+    while read -r v; do GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + v )); done \
+      < <(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>>"$LOGFILE")
+    if [[ $GPU_TOTAL_VRAM -eq 0 ]]; then GPU_TOTAL_VRAM=$GPU_VRAM; fi
+
+  elif command -v rocm-smi &>/dev/null || [[ -e /dev/kfd ]]; then
+    GPU_BACKEND="amd"
+    GPU_NAME=$(rocm-smi --showproductname 2>>"$LOGFILE" | grep -oP 'Card series:\s*\K.*' | head -1 || echo "AMD GPU")
+    GPU_VRAM=$(rocm-smi --showmeminfo vram 2>>"$LOGFILE" | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' | head -1 || echo "0")
+    # Convert bytes to MiB
+    if [[ "${GPU_VRAM:-0}" -gt 1000000 ]]; then
+      GPU_VRAM=$(( GPU_VRAM / 1048576 ))
+    fi
+    GPU_COUNT=$(rocm-smi --showid 2>>"$LOGFILE" | grep -c 'GPU\[' || echo 1)
+    if [[ $GPU_COUNT -ge 2 ]]; then
+      GPU_TOTAL_VRAM=$(( GPU_VRAM * GPU_COUNT ))  # rocm-smi per-device sum
+    else
+      GPU_TOTAL_VRAM=$GPU_VRAM
+    fi
+  fi
+
+  # Pin packages after successful detection to prevent future mismatches
+  if [[ "$GPU_BACKEND" == "nvidia" ]]; then
+    _pin_nvidia_packages
+  fi
+}
+
+# Lightweight backend-only detection (for subcommands that don't need full GPU info)
+detect_gpu_backend() {
+  if command -v nvidia-smi &>/dev/null && nvidia-smi &>/dev/null; then
+    echo "nvidia"
+  elif command -v rocm-smi &>/dev/null || [[ -e /dev/kfd ]]; then
+    echo "amd"
+  else
+    echo "cpu"
+  fi
+}
+
+_has_nvml_mismatch_signature() {
+  local output="${1:-}"
+  echo "$output" | grep -Eqi \
+    "driver/library version mismatch|failed to initialize nvml|nvidia-container-cli: initialization error: nvml error"
+}
+
+# ── [FIX: nvml-mismatch] NVIDIA driver/library version mismatch detection ────
+# Detects if host NVIDIA driver and container CUDA driver versions are misaligned.
+# Returns: 0 = matched, 1 = mismatched, 2 = couldn't detect
+# Outputs: diagnostics to stdout (host_driver=X.X container_cuda=Y.Y)
+detect_nvml_mismatch() {
+  local host_driver container_cuda docker_test_image="${1:-nvidia/cuda:12.4.1-base-ubuntu22.04}"
+  local test_timeout="${NVIDIA_DOCKER_TEST_TIMEOUT:-180}"
+  local host_probe_output host_probe_rc container_probe_output container_probe_rc
+
+  # Get host driver version
+  host_probe_output=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>&1) && host_probe_rc=0 || host_probe_rc=$?
+  [[ -n "$host_probe_output" ]] && printf '%s\n' "$host_probe_output" >> "$LOGFILE"
+
+  if [[ $host_probe_rc -eq 0 ]]; then
+    host_driver=$(echo "$host_probe_output" | head -1 | xargs || echo "")
+  elif _has_nvml_mismatch_signature "$host_probe_output"; then
+    log "NVIDIA host probe reported NVML driver/library mismatch"
+    return 1
+  else
+    host_driver=""
+  fi
+
+  if [[ -z "$host_driver" ]]; then
+    log "NVIDIA driver version detection failed (non-fatal)"
+    return 2
+  fi
+
+  # Get container CUDA driver compatibility version
+  container_probe_output=$(timeout --signal=TERM "$test_timeout" \
+    docker run --rm --gpus all "$docker_test_image" \
+    nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>&1) && container_probe_rc=0 || container_probe_rc=$?
+  [[ -n "$container_probe_output" ]] && printf '%s\n' "$container_probe_output" >> "$LOGFILE"
+
+  if [[ $container_probe_rc -eq 0 ]]; then
+    container_cuda=$(echo "$container_probe_output" | head -1 | xargs || echo "")
+  elif _has_nvml_mismatch_signature "$container_probe_output"; then
+    log "NVIDIA container probe reported NVML driver/library mismatch"
+    return 1
+  else
+    container_cuda=""
+  fi
+
+  if [[ -z "$container_cuda" ]]; then
+    log "Container CUDA driver detection failed (non-fatal)"
+    return 2
+  fi
+
+  # Compare major.minor versions (e.g., 535.104.05 → 535.104)
+  local host_major_minor container_major_minor
+  host_major_minor=$(echo "$host_driver" | cut -d. -f1,2)
+  container_major_minor=$(echo "$container_cuda" | cut -d. -f1,2)
+
+  log "NVIDIA driver mismatch check: host=${host_driver} (${host_major_minor}) vs container=${container_cuda} (${container_major_minor})"
+
+  if [[ "$host_major_minor" != "$container_major_minor" ]]; then
+    log "NVIDIA driver/library MISMATCH detected: host ${host_driver} != container ${container_cuda}"
+    return 1
+  fi
+
+  log "NVIDIA driver/library versions aligned (${host_major_minor})"
+  return 0
+}
+
+# ── [FIX: nvml-mismatch] Multi-strategy NVIDIA driver/library mismatch repair ──
+# Strategy 1: Reload kernel modules (fastest, no packages needed)
+# Strategy 2: Downgrade userspace libs to match kernel module version
+# Strategy 3: Upgrade kernel module to match userspace libs (original approach)
+# Non-fatal: logs warnings on failure but does not halt.
+repair_nvml_mismatch() {
+  local host_probe_output kernel_version="" lib_version="" initial_status post_repair_status
+
+  log "Attempting to repair NVIDIA driver/library mismatch..."
+
+  detect_nvml_mismatch && initial_status=0 || initial_status=$?
+  if [[ $initial_status -eq 0 ]]; then
+    log "No mismatch detected, skipping repair"
+    return 0
+  elif [[ $initial_status -eq 2 ]]; then
+    # [NON-FATAL: probe] NVML probe may fail on transient driver issues.
+    host_probe_output=$(nvidia-smi 2>&1) || warn "nvidia-smi probe failed (non-fatal)"
+    if _has_nvml_mismatch_signature "$host_probe_output"; then
+      warn "NVIDIA host probe reports driver/library mismatch — forcing repair attempt"
+    else
+      warn "Unable to detect NVIDIA driver/library mismatch state (skipping repair)"
+      return 1
+    fi
+  fi
+
+  # Get kernel module version (the version that's actually loaded)
+  if [[ -f /proc/driver/nvidia/version ]]; then
+    kernel_version="$(grep -oP 'Kernel Module\s+\K[0-9.]+' /proc/driver/nvidia/version || echo "")"
+  fi
+  if [[ -z "${kernel_version:-}" ]] && [[ -f /sys/module/nvidia/version ]]; then
+    kernel_version="$(cat /sys/module/nvidia/version 2>/dev/null || echo "")"  # stderr expected: file may not exist
+  fi
+
+  # Get NVML library version from nvidia-smi error output
+  lib_version="$(nvidia-smi 2>&1 | grep -oP 'NVML library version:\s*\K[0-9.]+' || echo "")"
+
+  if [[ -n "$kernel_version" ]]; then
+    log "Kernel module version: ${kernel_version}"
+  fi
+  if [[ -n "$lib_version" ]]; then
+    log "NVML library version: ${lib_version}"
+  fi
+
+  # ── Strategy 1: Kernel module reload ────────────────────────────────────
+  # Unload and reload NVIDIA modules so the userspace libs match what loads.
+  # This is the fastest fix and requires no package changes.
+  log "Strategy 1: Attempting kernel module reload..."
+
+  # Stop processes using the GPU before module unload
+  local gpu_containers
+  gpu_containers="$(docker ps --format '{{.Names}}' --filter 'label=com.docker.compose.project' 2>/dev/null | grep '^dream-' || echo "")"  # stderr expected: docker may not be running
+  if [[ -n "$gpu_containers" ]]; then
+    log "Stopping Docker containers before module reload..."
+    # [NON-FATAL: cleanup] Some containers may already be stopped or unresponsive.
+    docker stop $gpu_containers >> "$LOGFILE" 2>&1 || warn "Some containers failed to stop (non-fatal)"
+  fi
+
+  # Stop persistence daemon if running
+  if pgrep -x nvidia-persistenced >/dev/null 2>&1; then  # stderr expected: process check
+    log "Stopping nvidia-persistenced..."
+    # [NON-FATAL: cleanup] Persistence daemon may have already exited.
+    kill "$(pgrep -x nvidia-persistenced)" 2>/dev/null || warn "nvidia-persistenced not running (non-fatal)"  # stderr expected: may not exist
+    sleep 1
+  fi
+
+  # Kill any remaining GPU processes
+  if [[ -e /dev/nvidia0 ]]; then
+    local gpu_pids
+    gpu_pids="$(fuser /dev/nvidia* 2>/dev/null | xargs || echo "")"  # stderr expected: fuser probe
+    if [[ -n "$gpu_pids" ]]; then
+      log "Killing GPU processes: ${gpu_pids}"
+      # [NON-FATAL: cleanup] Some GPU processes may have already exited.
+      kill $gpu_pids 2>/dev/null || warn "some GPU processes already exited (non-fatal)"  # stderr expected: processes may have exited
+      sleep 2
+    fi
+  fi
+
+  # Unload modules in dependency order
+  local reload_success=false
+  # [NON-FATAL: cleanup] Module may not be loaded on this host.
+  rmmod nvidia_uvm 2>>"$LOGFILE" || warn "nvidia_uvm not loaded (non-fatal)"
+  # [NON-FATAL: cleanup] Module may not be loaded on this host.
+  rmmod nvidia_drm 2>>"$LOGFILE" || warn "nvidia_drm not loaded (non-fatal)"
+  # [NON-FATAL: cleanup] Module may not be loaded on this host.
+  rmmod nvidia_modeset 2>>"$LOGFILE" || warn "nvidia_modeset not loaded (non-fatal)"
+  if rmmod nvidia 2>>"$LOGFILE"; then
+    log "NVIDIA kernel modules unloaded successfully"
+    # Reload — nvidia-smi triggers automatic module load
+    sleep 1
+    if nvidia-smi &>/dev/null; then  # stderr expected: driver reinit
+      reload_success=true
+      log "NVIDIA kernel modules reloaded — nvidia-smi works"
+      nvidia-smi --query-gpu=driver_version,name --format=csv,noheader 2>>"$LOGFILE" | \
+        while read -r line; do log "  GPU: ${line}"; done
+    else
+      warn "nvidia-smi still fails after module reload"
+    fi
+  else
+    warn "Could not unload nvidia module (in use) — trying strategy 2"
+  fi
+
+  if [[ "$reload_success" == "true" ]]; then
+    # Verify with DKMS that module version matches kernel expectation
+    if command -v dkms &>/dev/null; then  # stderr expected: dkms check
+      local dkms_status
+      dkms_status="$(dkms status 2>/dev/null | grep nvidia || echo "")"  # stderr expected: dkms probe
+      if [[ -n "$dkms_status" ]]; then
+        log "DKMS status: ${dkms_status}"
+      fi
+    fi
+
+    # Restart Docker so it picks up the reloaded driver
+    # [NON-FATAL: docker] Docker may not be managed by systemctl on Vast.ai.
+    # [NON-FATAL: docker] Docker may not be managed by systemctl on Vast.ai.
+    systemctl restart docker 2>>"$LOGFILE" || service docker restart 2>>"$LOGFILE" \
+      || warn "Docker restart failed (non-fatal)"
+
+    # Verify CUDA compat libs aren't shadowing host driver inside containers
+    # (per NVIDIA NIM troubleshooting guide — bundled compat libs at
+    #  /usr/local/cuda-*/compat/ can override the host-mounted driver)
+    # [NON-FATAL: nvidia-ctk] Toolkit may already be configured or unavailable.
+    nvidia-ctk runtime configure --runtime=docker 2>>"$LOGFILE" \
+      || warn "nvidia-ctk configure failed (non-fatal)"
+
+    # Re-start any containers we stopped
+    if [[ -n "$gpu_containers" ]]; then
+      # [NON-FATAL: cleanup] Some containers may fail to restart on driver changes.
+      docker start $gpu_containers >> "$LOGFILE" 2>&1 || warn "Some containers failed to restart (non-fatal)"
+    fi
+
+    detect_nvml_mismatch && post_repair_status=0 || post_repair_status=$?
+    if [[ $post_repair_status -eq 0 ]]; then
+      _pin_nvidia_packages
+      return 0
+    elif [[ $post_repair_status -eq 1 ]]; then
+      warn "NVIDIA driver mismatch persists after module reload"
+    else
+      warn "Unable to verify NVIDIA driver/library mismatch after module reload"
+    fi
+  fi
+
+  # ── Strategy 2: Downgrade userspace to match kernel module ──────────────
+  # If we know the kernel module version, install matching userspace packages.
+  if [[ -n "${kernel_version:-}" ]]; then
+    log "Strategy 2: Aligning userspace libs to kernel module version ${kernel_version}..."
+    local driver_major
+    driver_major="$(echo "$kernel_version" | cut -d. -f1)"
+
+    if type -t _wait_for_dpkg_lock >/dev/null 2>&1; then
+      # [NON-FATAL: dpkg] apt will still enforce DPkg::Lock::Timeout.
+      _wait_for_dpkg_lock 60 || warn "dpkg lock not released in time — DPkg::Lock::Timeout will handle"
+    fi
+
+    # Try to install the exact matching driver version
+    if apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" update -qq 2>>"$LOGFILE" \
+      && apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" install -y -qq \
+        --allow-downgrades \
+        "nvidia-utils-${driver_major}=${kernel_version}-*" \
+        "libnvidia-ml-dev=${kernel_version}-*" \
+        2>>"$LOGFILE"; then
+      log "Userspace libs downgraded to match kernel ${kernel_version}"
+      if nvidia-smi &>/dev/null; then  # stderr expected: driver reinit
+        log "nvidia-smi works after userspace downgrade"
+        detect_nvml_mismatch && post_repair_status=0 || post_repair_status=$?
+        if [[ $post_repair_status -eq 0 ]]; then
+          _pin_nvidia_packages
+          return 0
+        elif [[ $post_repair_status -eq 1 ]]; then
+          warn "NVIDIA driver mismatch persists after userspace downgrade"
+        else
+          warn "Unable to verify NVIDIA driver/library mismatch after userspace downgrade"
+        fi
+      fi
+    else
+      warn "Userspace downgrade to ${kernel_version} failed — trying strategy 3"
+    fi
+  fi
+
+  # ── Strategy 3: Upgrade everything (original approach) ──────────────────
+  log "Strategy 3: Attempting full driver upgrade..."
+  if type -t _wait_for_dpkg_lock >/dev/null 2>&1; then
+    # [NON-FATAL: dpkg] apt will still enforce DPkg::Lock::Timeout.
+    _wait_for_dpkg_lock 60 || warn "dpkg lock not released in time — DPkg::Lock::Timeout will handle"
+  fi
+
+  if apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" update -qq 2>>"$LOGFILE" \
+    && apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" install -y -qq \
+      --only-upgrade "nvidia-driver-*" 2>>"$LOGFILE"; then
+    log "NVIDIA driver upgrade completed"
+    systemctl restart docker 2>>"$LOGFILE" || service docker restart 2>>"$LOGFILE" \
+      || warn "Docker restart failed (non-fatal)"
+    sleep 2
+    if nvidia-smi &>/dev/null; then  # stderr expected: driver reinit
+      detect_nvml_mismatch && post_repair_status=0 || post_repair_status=$?
+      if [[ $post_repair_status -eq 0 ]]; then
+        log "NVIDIA driver mismatch RESOLVED after upgrade"
+        _pin_nvidia_packages
+        return 0
+      elif [[ $post_repair_status -eq 1 ]]; then
+        warn "NVIDIA driver mismatch persists after upgrade"
+      else
+        warn "Unable to verify NVIDIA driver/library mismatch after upgrade"
+      fi
+    else
+      warn "nvidia-smi still fails after upgrade"
+    fi
+  else
+    warn "NVIDIA driver upgrade failed"
+  fi
+
+  warn "All NVML mismatch repair strategies exhausted — GPU may not work"
+  warn "Manual fix: reboot the instance, or try: rmmod nvidia_uvm nvidia_drm nvidia_modeset nvidia && nvidia-smi"
+  return 1
+}
+
+# Pin NVIDIA packages to prevent unattended-upgrades from causing future mismatches
+# (NVIDIA support stats: driver mismatches cause 31% of GPU cluster issues)
+_pin_nvidia_packages() {
+  # Hold nvidia packages so unattended-upgrades can't break them
+  local held=0
+  for pkg in $(dpkg -l | grep -E '^ii\s+(nvidia-driver|nvidia-utils|nvidia-dkms|libnvidia)' | awk '{print $2}'); do
+    apt-mark hold "$pkg" 2>>"$LOGFILE" && held=$((held + 1))
+  done
+  if [[ $held -gt 0 ]]; then
+    log "Pinned ${held} NVIDIA packages (prevents unattended-upgrades mismatch)"
+  fi
+
+  # Also blacklist nvidia from unattended-upgrades if config exists
+  local uu_conf="/etc/apt/apt.conf.d/50unattended-upgrades"
+  if [[ -f "$uu_conf" ]] && ! grep -q 'nvidia' "$uu_conf"; then
+    if grep -q 'Unattended-Upgrade::Package-Blacklist' "$uu_conf"; then
+      # [NON-FATAL: apt] Blacklist update is best-effort; mismatches are handled elsewhere.
+      sed -i '/Unattended-Upgrade::Package-Blacklist/a\    "nvidia-*";' "$uu_conf" 2>>"$LOGFILE" \
+        || warn "Failed to add nvidia to unattended-upgrades blacklist (non-fatal)"
+      log "Added nvidia-* to unattended-upgrades blacklist"
+    fi
+  fi
+}
+
+# ── Post-install fix orchestrator ───────────────────────────────────────────
+# Called by phases/05, subcommands/fix, subcommands/resume.
+# Coordinates all post-install fixes in correct order.
+apply_post_install_fixes() {
+  local ds_dir="$1"
+  local gpu_backend="${2:-auto}"
+  local data_dir="${ds_dir}/data"
+  local env_file="${ds_dir}/.env"
+  local cpu_count docker_cpu compose_ceiling max_cpu
+  cpu_count=$(nproc 2>>"$LOGFILE" || echo 1)
+  docker_cpu=$(docker info --format '{{.NCPU}}' 2>>"$LOGFILE" || echo "unknown")
+
+  [[ "$gpu_backend" == "auto" ]] && gpu_backend=$(detect_gpu_backend)
+
+  # Docker group membership
+  if getent group docker &>/dev/null; then
+    # [NON-FATAL: permissions] User can still run with sudo or log in again.
+    usermod -aG docker "$DREAM_USER" || warn "docker group add failed (non-fatal)"
+  fi
+
+  # CPU limit fix — cap any cpus: value that exceeds (nproc - 1).
+  # Always run: cheap no-op on files whose values already fit.
+  compose_ceiling=$(get_compose_cpu_ceiling)
+  max_cpu=$(compute_safe_cpu_cap)
+  cap_cpu_in_yaml "$ds_dir" "$max_cpu"
+  log "CPU limits capped to ${max_cpu} (nproc=${cpu_count}, docker=${docker_cpu}, ceiling=${compose_ceiling})"
+
+  # Keep env-substituted CPU limits safe for overlays that use
+  # ${LLAMA_CPU_LIMIT:-...} syntax.
+  if [[ -f "$env_file" ]]; then
+    local llama_limit="${max_cpu}.0"
+    local llama_reservation="2.0"
+    if [[ "$max_cpu" -lt 2 ]]; then
+      llama_reservation="1.0"
+    fi
+    env_set "$env_file" "LLAMA_CPU_LIMIT" "$llama_limit"
+    env_set "$env_file" "LLAMA_CPU_RESERVATION" "$llama_reservation"
+    log "LLAMA CPU env caps set to limit=${llama_limit}, reservation=${llama_reservation}"
+  fi
+
+  _apply_permission_fixes "$ds_dir" "$data_dir" "$gpu_backend"
+  _apply_compatibility_fixes "$ds_dir"
+  _apply_env_defaults "$ds_dir" "$env_file" "$data_dir"
+  ensure_dream_cli_command "$ds_dir"
+  _cap_context_for_vram "$ds_dir"
+
+  # ── [FIX: nvml-mismatch] Post-install NVIDIA driver check (fallback) ──────
+  if [[ "$gpu_backend" == "nvidia" ]]; then
+    log "Checking for NVIDIA driver/library version alignment (post-install)..."
+    if detect_nvml_mismatch; then
+      :
+    else
+      mismatch_status=$?
+      if [[ $mismatch_status -eq 1 ]]; then
+        warn "NVIDIA driver/library mismatch detected post-install (non-fatal)"
+        warn "Run 'bash setup.sh --fix' to repair, or manually upgrade nvidia-driver-*"
+      elif [[ $mismatch_status -eq 2 ]]; then
+        local host_probe_output
+        # [NON-FATAL: probe] NVML probe may fail on transient driver issues.
+        host_probe_output=$(nvidia-smi 2>&1) || warn "nvidia-smi probe failed (non-fatal)"
+        if _has_nvml_mismatch_signature "$host_probe_output"; then
+          warn "Host NVIDIA stack reports driver/library mismatch (non-fatal)"
+          warn "If 'bash setup.sh --fix' cannot recover, reinstall NVIDIA driver package and reboot"
+        fi
+      fi
+    fi
+  fi
+
+  log "Post-install fixes applied (including ACL-based permission system)"
+}
+
+_apply_permission_fixes() {
+  local ds_dir="$1" data_dir="$2" gpu_backend="$3"
+  ensure_acl_tools
+  precreate_extension_data_dirs "$ds_dir"
+  apply_data_acl "$data_dir"
+  fix_known_uid_requirements "$data_dir" "$gpu_backend"
+  configure_dream_umask
+  create_permission_fix_script "$ds_dir"
+  apply_data_acl "${ds_dir}/extensions"
+  if [[ -d "${ds_dir}/user-extensions" ]]; then
+    apply_data_acl "${ds_dir}/user-extensions"
+  fi
+  # [NON-FATAL: scripts] Missing exec bits only affects helper scripts.
+  find "${ds_dir}/scripts" -name "*.sh" -exec chmod +x {} + || warn "chmod scripts failed (non-fatal)"
+  mkdir -p "${ds_dir}/logs"
+  apply_data_acl "${ds_dir}/logs"
+}
+
+_apply_compatibility_fixes() {
+  local ds_dir="$1"
+  ensure_whisper_ui_compatibility "$ds_dir"
+  ensure_webui_stt_model_alignment "$ds_dir"
+  patch_openclaw_inject_token_runtime "$ds_dir"
+}
+
+_apply_env_defaults() {
+  local ds_dir="$1" env_file="$2" data_dir="$3"
+
+  # Seed .env from .env.example if missing (fatal if fails — compose cannot start without all required variables)
+  if [[ ! -f "$env_file" ]]; then
+    local env_example="${ds_dir}/.env.example"
+    if [[ -f "$env_example" ]]; then
+      cp "$env_example" "$env_file" || {
+        err ".env.example copy to ${env_file} failed — Docker Compose cannot start"
+        exit 1
+      }
+      chown "${DREAM_USER}:${DREAM_USER}" "$env_file" || {
+        err ".env ownership fix after copy failed — Docker Compose cannot start"
+        exit 1
+      }
+      chmod 0660 "$env_file" || {
+        err ".env chmod to 0660 after copy failed — Docker Compose cannot start"
+        exit 1
+      }
+      log "Seeded .env from .env.example"
+    else
+      log "No .env.example found; will create .env via env_set()"
+    fi
+  fi
+
+  # Fix .env ownership and permissions if file exists (fatal if fails — compose cannot start without readable .env)
+  if [[ -f "$env_file" ]]; then
+    # Check and fix ownership independently
+    if [[ "$(stat -c '%U' "$env_file" 2>>"$LOGFILE" || echo root)" != "${DREAM_USER}" ]]; then
+      chown "${DREAM_USER}:${DREAM_USER}" "$env_file" || {
+        err ".env ownership fix failed — Docker Compose cannot start"
+        exit 1
+      }
+    fi
+    # Check and fix mode independently
+    if [[ "$(stat -c '%a' "$env_file" 2>>"$LOGFILE")" != "660" ]]; then
+      chmod 0660 "$env_file" || {
+        err ".env chmod to 0660 failed — Docker Compose cannot start"
+        exit 1
+      }
+    fi
+  fi
+
+  # Helper: Replace CHANGEME or empty with generated secret/value
+  _replace_changeme() {
+    local key="$1" value="$2"
+    local current="$(env_get "$env_file" "$key")"
+    if [[ -z "$current" || "$current" == "CHANGEME" ]]; then
+      env_set "$env_file" "$key" "$value"
+      log "Set ${key}"
+    fi
+  }
+
+  # Generate or replace hard-required secrets (compose uses ${VAR:?error} syntax)
+  _replace_changeme "WEBUI_SECRET" "$(openssl rand -hex 32)"
+  _replace_changeme "SEARXNG_SECRET" "$(openssl rand -hex 32)"
+  _replace_changeme "LITELLM_KEY" "sk-dream-$(openssl rand -hex 16)"
+  _replace_changeme "N8N_PASS" "$(openssl rand -hex 16)"
+  _replace_changeme "LIVEKIT_API_KEY" "$(openssl rand -hex 16)"
+  _replace_changeme "LIVEKIT_API_SECRET" "$(openssl rand -hex 32)"
+  _replace_changeme "DIFY_SECRET_KEY" "$(openssl rand -hex 32)"
+  _replace_changeme "OPENCODE_SERVER_PASSWORD" "$(openssl rand -hex 16)"
+
+  # Set non-secret required variables (also checked by compose)
+  _replace_changeme "N8N_USER" "admin@dreamserver.local"
+  _replace_changeme "OPENCLAW_TOKEN" "$(openssl rand -hex 24)"
+  _replace_changeme "DASHBOARD_API_KEY" "$(openssl rand -hex 24)"
+
+  # GGUF_FILE — detect from data/models if not set
+  if [[ -z "$(env_get "$env_file" "GGUF_FILE")" ]]; then
+    local first_model
+    first_model=$(find "${data_dir}/models/" -maxdepth 1 -name "*.gguf" -type f \
+      -printf '%s %f\n' 2>&1 | sort -rn | head -1 | cut -d' ' -f2- || echo "")
+    if [[ -n "$first_model" ]]; then
+      env_set "$env_file" "GGUF_FILE" "$first_model"
+      log "Set GGUF_FILE=${first_model}"
+    fi
+  fi
+}
+
+# ── VRAM-aware context size capping ───────────────────────────────────────
+# The upstream installer sets CTX_SIZE=131072 when Hermes is enabled, but
+# this exceeds VRAM on cards <=24 GB with large models. Cap CTX_SIZE based
+# on available VRAM headroom after model weight, and enable KV cache
+# quantization to maximize usable context within the budget.
+_cap_context_for_vram() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+
+  # Skip if no GPU
+  if [[ "${GPU_BACKEND:-cpu}" == "cpu" ]]; then
+    return 0
+  fi
+
+  local vram_mb="${GPU_VRAM:-0}"
+  local per_gpu_vram_mb="${GPU_VRAM:-0}"
+  local model_size_per_gpu_mb=0
+  local current_ctx model_size_mb headroom_mb safe_ctx kv_quant
+
+  # Multi-GPU: cap by per-GPU VRAM budget to avoid CUDA0 OOM
+  if [[ "${GPU_COUNT:-1}" -ge 2 && "${GPU_TOTAL_VRAM:-0}" -gt 0 ]]; then
+    per_gpu_vram_mb=$(( GPU_TOTAL_VRAM / GPU_COUNT ))
+    if [[ "${GPU_VRAMS+set}" == "set" && "${#GPU_VRAMS[@]}" -gt 0 ]]; then
+      local min_vram="${GPU_VRAMS[0]}"
+      local vram
+      for vram in "${GPU_VRAMS[@]}"; do
+        if [[ "$vram" -lt "$min_vram" ]]; then
+          min_vram="$vram"
+        fi
+      done
+      per_gpu_vram_mb="$min_vram"
+    fi
+  fi
+
+  current_ctx="$(env_get "$env_file" "CTX_SIZE")"
+  current_ctx="${current_ctx:-16384}"
+
+  # Get model size from .env or fallback to TIER_MODEL_SIZE_MB
+  model_size_mb="$(env_get "$env_file" "LLM_MODEL_SIZE_MB")"
+  model_size_mb="${model_size_mb:-${TIER_MODEL_SIZE_MB:-0}}"
+
+  if [[ "$per_gpu_vram_mb" -eq 0 || "$model_size_mb" -eq 0 ]]; then
+    log "VRAM or model size unknown -- skipping context cap"
+    return 0
+  fi
+
+  # Split model weight across GPUs when available; fall back to full size on single GPU.
+  if [[ "${GPU_COUNT:-1}" -ge 2 ]]; then
+    model_size_per_gpu_mb=$(( (model_size_mb + GPU_COUNT - 1) / GPU_COUNT ))
+  else
+    model_size_per_gpu_mb="$model_size_mb"
+  fi
+
+  # Calculate per-GPU headroom (VRAM - model weight per GPU - 1 GB overhead)
+  headroom_mb=$(( per_gpu_vram_mb - model_size_per_gpu_mb - 1024 ))
+
+  if [[ $headroom_mb -le 0 ]]; then
+    # Model barely fits -- use minimum context
+    safe_ctx=2048
+    kv_quant="q4_0"
+    warn "Model (${model_size_mb}MB) nearly exceeds GPU VRAM (${per_gpu_vram_mb}MB) -- setting CTX_SIZE=${safe_ctx}"
+  elif [[ $headroom_mb -le 2048 ]]; then
+    # ~2 GB headroom
+    safe_ctx=4096
+    kv_quant="q4_0"
+  elif [[ $headroom_mb -le 4096 ]]; then
+    # ~4 GB headroom (typical RTX 3090 with 18.6 GB model)
+    safe_ctx=16384
+    kv_quant="q8_0"
+  elif [[ $headroom_mb -le 8192 ]]; then
+    # ~8 GB headroom
+    safe_ctx=32768
+    kv_quant="q8_0"
+  elif [[ $headroom_mb -le 16384 ]]; then
+    # ~16 GB headroom (e.g., RTX 4090 with smaller model)
+    safe_ctx=65536
+    kv_quant="q8_0"
+  else
+    # >16 GB headroom -- large GPU, let it run
+    safe_ctx=131072
+    kv_quant="f16"
+  fi
+
+  if [[ "$current_ctx" -gt "$safe_ctx" ]]; then
+    log "VRAM budget per GPU: ${per_gpu_vram_mb}MB, model per GPU: ${model_size_per_gpu_mb}MB, headroom: ${headroom_mb}MB"
+    log "Capping CTX_SIZE: ${current_ctx} -> ${safe_ctx} (prevents OOM on ${per_gpu_vram_mb}MB GPU)"
+    env_set "$env_file" "CTX_SIZE" "$safe_ctx"
+
+    # Set KV cache quantization to maximize context within VRAM budget
+    local current_kv_k current_kv_v
+    current_kv_k="$(env_get "$env_file" "LLAMA_ARG_CACHE_TYPE_K")"
+    current_kv_v="$(env_get "$env_file" "LLAMA_ARG_CACHE_TYPE_V")"
+
+    if [[ "${current_kv_k:-f16}" == "f16" && "$kv_quant" != "f16" ]]; then
+      env_set "$env_file" "LLAMA_ARG_CACHE_TYPE_K" "$kv_quant"
+      env_set "$env_file" "LLAMA_ARG_CACHE_TYPE_V" "$kv_quant"
+      log "KV cache quantization: f16 -> ${kv_quant} (reduces VRAM, trades some quality)"
+    fi
+  else
+    log "CTX_SIZE=${current_ctx} fits within VRAM budget (${headroom_mb}MB headroom) -- no change"
+  fi
+
+  _cap_batch_for_vram "$env_file" "$per_gpu_vram_mb" "$safe_ctx"
+}
+
+# ── VRAM-aware batch size capping ─────────────────────────────────────────
+# Prevent compute buffer OOM on multi-GPU by bounding batch size per GPU.
+_cap_batch_for_vram() {
+  local env_file="$1" vram_mb="$2" ctx_size="$3"
+  local current_batch safe_batch
+
+  current_batch="$(env_get "$env_file" "LLAMA_BATCH_SIZE")"
+  current_batch="${current_batch:-2048}"
+
+  if [[ "$vram_mb" -le 12288 ]]; then
+    safe_batch=256
+  elif [[ "$vram_mb" -le 16384 ]]; then
+    safe_batch=512
+  elif [[ "$vram_mb" -le 24576 ]]; then
+    safe_batch=1024
+  else
+    safe_batch=2048
+  fi
+
+  if [[ "$ctx_size" -ge 65536 && "$safe_batch" -gt 512 ]]; then
+    safe_batch=512
+  elif [[ "$ctx_size" -ge 32768 && "$safe_batch" -gt 1024 ]]; then
+    safe_batch=1024
+  fi
+
+  if [[ ! "$current_batch" =~ ^[0-9]+$ ]]; then
+    env_set "$env_file" "LLAMA_BATCH_SIZE" "$safe_batch"
+    log "LLAMA_BATCH_SIZE invalid ('${current_batch}') -- set to ${safe_batch}"
+    return 0
+  fi
+
+  if [[ "$current_batch" -gt "$safe_batch" ]]; then
+    env_set "$env_file" "LLAMA_BATCH_SIZE" "$safe_batch"
+    log "Capping LLAMA_BATCH_SIZE: ${current_batch} -> ${safe_batch} (prevents CUDA OOM)"
+  else
+    log "LLAMA_BATCH_SIZE=${current_batch} fits within VRAM budget -- no change"
+  fi
+}
diff --git a/dream-server/installers/p2p-gpu/lib/gpu-topology.sh b/dream-server/installers/p2p-gpu/lib/gpu-topology.sh
new file mode 100644
index 000000000..29a695338
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/gpu-topology.sh
@@ -0,0 +1,366 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Topology & Multi-GPU Assignment
+# ============================================================================
+# Part of: dream-server/installers/p2p-gpu/lib/
+# Purpose: Per-GPU enumeration, topology detection (NVLink/PCIe),
+#          GPU-to-service assignment delegation, env var writing
+#
+# Expects: GPU_BACKEND, GPU_COUNT, GPU_VRAM, LOGFILE,
+#          log(), warn(), err(), env_set(), env_get()
+# Provides: enumerate_gpus(), generate_topology_json(),
+#           run_gpu_assignment()
+#
+# Modder notes:
+#   All functions are no-ops when GPU_COUNT < 2. Single-GPU path is untouched.
+#   Prefers upstream assign_gpus.py + nvidia-topo.sh when available;
+#   built-in fallback handles pre-clone state.
+#   GPU_UUIDS, GPU_VRAMS, GPU_NAMES are indexed arrays (not associative).
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+# ── Per-GPU enumeration ──────────────────────────────────────────────────────
+# Populates: GPU_UUIDS[], GPU_VRAMS[] (MiB each), GPU_NAMES[], GPU_TOTAL_VRAM
+enumerate_gpus() {
+  [[ "${GPU_COUNT:-0}" -lt 2 ]] && return 0
+
+  GPU_UUIDS=()
+  GPU_VRAMS=()
+  GPU_NAMES=()
+  GPU_TOTAL_VRAM=0
+
+  if [[ "${GPU_BACKEND:-}" == "nvidia" ]]; then
+    while IFS=', ' read -r uuid vram name; do
+      [[ -z "$uuid" ]] && continue
+      GPU_UUIDS+=("$uuid")
+      GPU_VRAMS+=("${vram%%.*}")  # truncate decimals
+      GPU_NAMES+=("$name")
+      GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + ${vram%%.*} ))
+    # [NON-FATAL: probe] Topology is best-effort; fallback uses env values.
+    done < <(nvidia-smi --query-gpu=gpu_uuid,memory.total,name \
+      --format=csv,noheader,nounits 2>>"$LOGFILE" || warn "nvidia-smi GPU enumeration failed (non-fatal)")
+
+  elif [[ "${GPU_BACKEND:-}" == "amd" ]]; then
+    local idx=0
+    while IFS= read -r line; do
+      [[ -z "$line" ]] && continue
+      local gpu_name
+      gpu_name=$(rocm-smi -d "$idx" --showproductname 2>>"$LOGFILE" \
+        | grep -oP 'Card series:\s*\K.*' || echo "AMD GPU $idx")
+      local vram_bytes
+      vram_bytes=$(rocm-smi -d "$idx" --showmeminfo vram 2>>"$LOGFILE" \
+        | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' || echo "0")
+      local vram_mb=$(( vram_bytes / 1048576 ))
+      [[ $vram_mb -lt 1000 ]] && vram_mb=${GPU_VRAM:-0}  # fallback
+
+      GPU_UUIDS+=("AMD-GPU-${idx}")
+      GPU_VRAMS+=("$vram_mb")
+      GPU_NAMES+=("$gpu_name")
+      GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + vram_mb ))
+      idx=$((idx + 1))
+    done < <(rocm-smi --showid 2>>"$LOGFILE" | grep 'GPU\[' || echo "")
+  fi
+
+  # Sanity: if enumeration failed, fall back to count * per-GPU
+  if [[ ${#GPU_UUIDS[@]} -eq 0 ]]; then
+    GPU_TOTAL_VRAM=$(( ${GPU_VRAM:-0} * ${GPU_COUNT:-1} ))
+    warn "GPU enumeration failed — estimated total VRAM: ${GPU_TOTAL_VRAM} MiB"
+  fi
+}
+
+# ── Topology JSON generation ─────────────────────────────────────────────────
+# Builds JSON matching upstream assign_gpus.py input schema.
+# Args: $1 = output file path
+generate_topology_json() {
+  local output_file="$1"
+  [[ "${GPU_COUNT:-0}" -lt 2 ]] && return 0
+
+  # Strategy 1: Use upstream nvidia-topo.sh if cloned
+  if [[ -n "${DS_DIR:-}" && -f "${DS_DIR}/installers/lib/nvidia-topo.sh" \
+        && "${GPU_BACKEND:-}" == "nvidia" ]]; then
+    local upstream_topo
+    upstream_topo=$(
+      # Source upstream functions in subshell
+      warn() { echo "WARN: $*" >&2; }
+      err()  { echo "ERR: $*" >&2; }
+      source "${DS_DIR}/installers/lib/nvidia-topo.sh" 2>>"$LOGFILE"
+      detect_nvidia_topo 2>>"$LOGFILE"
+    ) || upstream_topo=""
+    if [[ -n "$upstream_topo" && "$upstream_topo" != "{}" ]]; then
+      echo "$upstream_topo" > "$output_file"
+      log "Topology generated via upstream nvidia-topo.sh"
+      return 0
+    fi
+  fi
+
+  # Strategy 2: Built-in — enumerate GPUs + parse topo matrix
+  _generate_builtin_topology "$output_file"
+}
+
+_generate_builtin_topology() {
+  local output_file="$1"
+
+  # Build gpus array
+  local gpus_json="["
+  for i in "${!GPU_UUIDS[@]}"; do
+    local mem_gb
+    mem_gb=$(awk "BEGIN {printf \"%.1f\", ${GPU_VRAMS[$i]} / 1024}")
+    [[ $i -gt 0 ]] && gpus_json+=","
+    gpus_json+="{\"index\":${i},\"uuid\":\"${GPU_UUIDS[$i]}\",\"name\":\"${GPU_NAMES[$i]}\",\"memory_gb\":${mem_gb}}"
+  done
+  gpus_json+="]"
+
+  # Build links array from nvidia-smi topo -m
+  local links_json="[]"
+  if [[ "${GPU_BACKEND:-}" == "nvidia" ]]; then
+    links_json=$(_parse_nvidia_topo_links)
+  fi
+
+  cat > "$output_file" << TOPO_EOF
+{
+  "vendor": "${GPU_BACKEND:-unknown}",
+  "gpu_count": ${#GPU_UUIDS[@]},
+  "gpus": ${gpus_json},
+  "links": ${links_json}
+}
+TOPO_EOF
+
+  log "Topology generated (built-in): ${#GPU_UUIDS[@]} GPUs"
+}
+
+_parse_nvidia_topo_links() {
+  # Parse nvidia-smi topo -m matrix into JSON links array
+  local matrix
+  matrix=$(nvidia-smi topo -m 2>>"$LOGFILE") || { echo "[]"; return; }
+
+  # Strip ANSI escape codes
+  matrix=$(echo "$matrix" | sed 's/\x1b\[[0-9;]*m//g')
+
+  local header_line
+  header_line=$(echo "$matrix" | grep -E '^\s+GPU[0-9]' | head -1 || echo "")
+  [[ -z "$header_line" ]] && { echo "[]"; return; }
+
+  local -a headers
+  read -ra headers <<< "$header_line"
+
+  local links="["
+  local first=true
+
+  while IFS= read -r line; do
+    [[ "$line" =~ ^[[:space:]] ]] && continue
+    [[ -z "$line" ]] && continue
+    local row_label
+    row_label=$(echo "$line" | awk '{print $1}')
+    [[ "$row_label" =~ ^GPU[0-9]+$ ]] || continue
+    local gpu_a="${row_label#GPU}"
+    local -a cells
+    read -ra cells <<< "$line"
+
+    for col_idx in "${!headers[@]}"; do
+      local col_header="${headers[$col_idx]}"
+      [[ "$col_header" =~ ^GPU[0-9]+$ ]] || continue
+      local gpu_b="${col_header#GPU}"
+      [[ "$gpu_a" -ge "$gpu_b" ]] && continue  # upper triangle only
+
+      local cell_idx=$(( col_idx + 1 ))  # +1 for row label
+      local link_type="${cells[$cell_idx]:-X}"
+      [[ "$link_type" == "X" ]] && continue  # self
+
+      local rank
+      rank=$(_link_rank "$link_type")
+      local label
+      label=$(_link_label "$link_type")
+
+      [[ "$first" != "true" ]] && links+=","
+      first=false
+      links+="{\"gpu_a\":${gpu_a},\"gpu_b\":${gpu_b},\"link_type\":\"${link_type}\",\"link_label\":\"${label}\",\"rank\":${rank}}"
+    done
+  done <<< "$matrix"
+
+  links+="]"
+  echo "$links"
+}
+
+# Link rank/label matching upstream nvidia-topo.sh
+_link_rank() {
+  case "$1" in
+    NV4|NV6|NV8|NV12|NV18) echo 100 ;;
+    XGMI|XGMI2)            echo 90  ;;
+    NV1|NV2|NV3)            echo 80  ;;
+    MIG)                    echo 70  ;;
+    PIX)                    echo 50  ;;
+    PXB)                    echo 40  ;;
+    PHB)                    echo 30  ;;
+    NODE)                   echo 20  ;;
+    SYS|SOC)                echo 10  ;;
+    *)                      echo 0   ;;
+  esac
+}
+
+_link_label() {
+  case "$1" in
+    NV*)   echo "NVLink" ;;
+    XGMI*) echo "InfinityFabric" ;;
+    MIG)   echo "MIG-SameDie" ;;
+    PIX)   echo "PCIe-SameSwitch" ;;
+    PXB)   echo "PCIe-CrossSwitch" ;;
+    PHB)   echo "PCIe-HostBridge" ;;
+    NODE)  echo "SameNUMA-NoBridge" ;;
+    SYS|SOC) echo "CrossNUMA" ;;
+    *)     echo "Unknown" ;;
+  esac
+}
+
+# ── GPU-to-service assignment ─────────────────────────────────────────────────
+# Args: $1 = ds_dir, $2 = env_file
+run_gpu_assignment() {
+  local ds_dir="$1" env_file="$2"
+  [[ "${GPU_COUNT:-0}" -lt 2 ]] && return 0
+
+  if [[ "${GPU_UUIDS+set}" != "set" ]]; then
+    enumerate_gpus
+  elif [[ "${#GPU_UUIDS[@]}" -eq 0 ]]; then
+    enumerate_gpus
+  fi
+
+  local topo_file="/tmp/ds-gpu-topo-$$.json"
+  generate_topology_json "$topo_file"
+  [[ ! -f "$topo_file" ]] && { warn "Topology file not generated — skipping assignment"; return 0; }
+
+  local model_size_mb
+  model_size_mb=$(env_get "$env_file" "LLM_MODEL_SIZE_MB")
+  model_size_mb="${model_size_mb:-${TIER_MODEL_SIZE_MB:-5760}}"
+
+  local assign_script="${ds_dir}/scripts/assign_gpus.py"
+  local result=""
+
+  # Strategy 1: Upstream assign_gpus.py
+  if [[ -f "$assign_script" ]] && command -v python3 &>/dev/null; then
+    result=$(python3 "$assign_script" \
+      --topology "$topo_file" \
+      --model-size "$model_size_mb" 2>&1) || {
+      warn "assign_gpus.py failed: ${result}"
+      result=""
+    }
+  fi
+
+  if [[ -n "$result" ]] && echo "$result" | jq -e '.gpu_assignment' &>/dev/null; then
+    _write_assignment_from_json "$result" "$env_file"
+    log "GPU assignment via upstream assign_gpus.py"
+  else
+    # Strategy 2: Built-in fallback — all GPUs to llama
+    _write_builtin_assignment "$env_file"
+    log "GPU assignment via built-in fallback (all GPUs → llama)"
+  fi
+
+  # Save topology for dashboard-api
+  mkdir -p "${ds_dir}/config"
+  # [NON-FATAL: telemetry] Topology persistence only aids dashboard visibility.
+  cp "$topo_file" "${ds_dir}/config/gpu-topology.json" 2>>"$LOGFILE" || warn "failed to persist gpu-topology.json (non-fatal)"
+  # [NON-FATAL: telemetry] Topology persistence only aids dashboard visibility.
+  chmod 644 "${ds_dir}/config/gpu-topology.json" 2>>"$LOGFILE" || warn "failed to set mode on gpu-topology.json (non-fatal)"
+
+  # Enable P2P transfers when NVLink detected (avoids host RAM round-trip)
+  if [[ -f "$topo_file" ]] && jq -e '.links[] | select(.link_type | startswith("NV"))' "$topo_file" &>/dev/null; then
+    env_set "$env_file" "GGML_CUDA_P2P" "1"
+    log "NVLink detected — enabled GGML_CUDA_P2P for direct GPU-to-GPU transfers"
+  fi
+
+  rm -f "$topo_file"
+}
+
+_map_llama_split_mode() {
+  case "${1:-}" in
+    ""|none|null) echo "none" ;;
+    tensor|hybrid) echo "row" ;;
+    pipeline) echo "layer" ;;
+    layer|row) echo "$1" ;;
+    *)
+      warn "Unknown split mode '${1}' from assign_gpus.py; defaulting to layer"
+      echo "layer"
+      ;;
+  esac
+}
+
+_ensure_numeric_main_gpu() {
+  local env_file="$1" split_mode="$2"
+  local main_gpu
+  main_gpu="$(env_get "$env_file" "LLAMA_ARG_MAIN_GPU")"
+  if [[ -z "$main_gpu" || ! "$main_gpu" =~ ^[0-9]+$ ]]; then
+    if [[ -n "$main_gpu" ]]; then
+      warn "Invalid LLAMA_ARG_MAIN_GPU='${main_gpu}' — resetting to 0"
+    fi
+    if [[ "$split_mode" != "none" ]]; then
+      env_set "$env_file" "LLAMA_ARG_MAIN_GPU" "0"
+    fi
+  fi
+}
+
+_write_assignment_from_json() {
+  local json="$1" env_file="$2"
+
+  local llama_uuids split_mode tensor_split
+  llama_uuids=$(echo "$json" | jq -r '.gpu_assignment.services.llama_server.gpus // [] | join(",")') || llama_uuids=""
+  split_mode=$(echo "$json" | jq -r '.gpu_assignment.services.llama_server.parallelism.mode // "none"') || split_mode="none"
+  split_mode=$(_map_llama_split_mode "$split_mode")
+  tensor_split=$(echo "$json" | jq -r '
+    .gpu_assignment.services.llama_server as $svc |
+    if $svc.parallelism.tensor_split then ($svc.parallelism.tensor_split | map(tostring) | join(","))
+    else "" end') || tensor_split=""
+
+  [[ -n "$llama_uuids" ]] && env_set "$env_file" "LLAMA_SERVER_GPU_UUIDS" "$llama_uuids"
+  env_set "$env_file" "LLAMA_ARG_SPLIT_MODE" "$split_mode"
+  [[ -n "$tensor_split" ]] && env_set "$env_file" "LLAMA_ARG_TENSOR_SPLIT" "$tensor_split"
+
+  local main_gpu
+  main_gpu=$(echo "$json" | jq -r '.gpu_assignment.services.llama_server.parallelism.main_gpu_index // empty') || main_gpu=""
+  if [[ "$main_gpu" =~ ^[0-9]+$ ]]; then
+    env_set "$env_file" "LLAMA_ARG_MAIN_GPU" "$main_gpu"
+  fi
+  _ensure_numeric_main_gpu "$env_file" "$split_mode"
+
+  # Per-service GPU UUIDs
+  local svc uuid
+  for svc in whisper comfyui embeddings; do
+    uuid=$(echo "$json" | jq -r ".gpu_assignment.services.${svc}.gpus[0]? // empty") || uuid=""
+    local env_key
+    case "$svc" in
+      whisper)    env_key="WHISPER_GPU_UUID" ;;
+      comfyui)    env_key="COMFYUI_GPU_UUID" ;;
+      embeddings) env_key="EMBEDDINGS_GPU_UUID" ;;
+    esac
+    [[ -n "$uuid" && "$uuid" != "null" ]] && env_set "$env_file" "$env_key" "$uuid"
+  done
+
+  env_set "$env_file" "GPU_COUNT" "${GPU_COUNT}"
+  log "Multi-GPU env vars written: llama=[${llama_uuids}] mode=${split_mode}"
+}
+
+_write_builtin_assignment() {
+  local env_file="$1"
+
+  # All GPUs → llama-server with pipeline parallelism
+  local all_uuids=""
+  for uuid in "${GPU_UUIDS[@]}"; do
+    [[ -n "$all_uuids" ]] && all_uuids+=","
+    all_uuids+="$uuid"
+  done
+
+  # VRAM-proportional tensor_split
+  local split=""
+  for vram in "${GPU_VRAMS[@]}"; do
+    [[ -n "$split" ]] && split+=","
+    split+="$vram"
+  done
+
+  [[ -n "$all_uuids" ]] && env_set "$env_file" "LLAMA_SERVER_GPU_UUIDS" "$all_uuids"
+  env_set "$env_file" "LLAMA_ARG_SPLIT_MODE" "layer"
+  [[ -n "$split" ]] && env_set "$env_file" "LLAMA_ARG_TENSOR_SPLIT" "$split"
+  env_set "$env_file" "GPU_COUNT" "${GPU_COUNT}"
+  _ensure_numeric_main_gpu "$env_file" "layer"
+
+  log "Built-in assignment: all ${GPU_COUNT} GPUs → llama, mode=layer, split=${split}"
+}
diff --git a/dream-server/installers/p2p-gpu/lib/logging.sh b/dream-server/installers/p2p-gpu/lib/logging.sh
new file mode 100644
index 000000000..9e955b357
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/logging.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Logging & Lifecycle
+# ============================================================================
+# Part of: p2p-gpu/lib/
+# Purpose: Log/warn/err/step functions, timestamp helper, cleanup trap,
+#          flock-based lock acquisition
+#
+# Expects: LOGFILE, LOCKFILE, RED, GREEN, YELLOW, CYAN, BOLD, NC
+# Provides: _ts(), log(), warn(), err(), step(), setup_cleanup_trap(),
+#           acquire_lock()
+#
+# Modder notes:
+#   Log writes use append-or-silent ( || : ) to avoid infinite recursion
+#
+#   if the logfile itself is unwritable. This is the ONE intentional
+#
+#   deviation from CLAUDE.md §4's "never || true" rule: the logging
+#
+#   functions ARE the warn() path, so they cannot call warn() on their
+#
+#   own failure without recursing. The 4 uses below are the only || :
+#
+#   in the entire toolkit.
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+_ts() { date '+%Y-%m-%d %H:%M:%S'; }
+
+log() {
+  echo -e "${GREEN}[✓]${NC} $*"
+  echo "$(_ts) [INFO]  $*" >> "$LOGFILE" || :
+}
+
+warn() {
+  echo -e "${YELLOW}[!]${NC} $*"
+  echo "$(_ts) [WARN]  $*" >> "$LOGFILE" || :
+}
+
+err() {
+  echo -e "${RED}[✗]${NC} $*" >&2
+  echo "$(_ts) [ERROR] $*" >> "$LOGFILE" || :
+}
+
+step() {
+  echo -e "\n${CYAN}${BOLD}━━━ $* ━━━${NC}\n"
+  echo "$(_ts) [STEP]  $*" >> "$LOGFILE" || :
+}
+
+# ── Cleanup trap ────────────────────────────────────────────────────────────
+setup_cleanup_trap() {
+  _vastai_cleanup() {
+    local exit_code=$?
+    if [[ $exit_code -ne 0 ]]; then
+      err "Script failed at line ${BASH_LINENO[0]:-unknown} (exit code: ${exit_code})"
+      err "Full log: ${LOGFILE}"
+      err "Last 10 lines:"
+      tail -10 "$LOGFILE" 2>&1 | sed 's/^/  /' || warn "could not read log tail"
+      echo ""
+      echo -e "${YELLOW}${BOLD}  What to try next:${NC}"
+      echo -e "    ${BOLD}bash $0 --fix${NC}      Apply fixes and restart services"
+      echo -e "    ${BOLD}bash $0 --resume${NC}   Quick restart (skip install phases)"
+      echo -e "    ${BOLD}bash $0 --status${NC}   Check what's actually running"
+      echo ""
+    fi
+    # Release flock (fd 9 auto-closes on exit)
+    exit "$exit_code"
+  }
+  trap _vastai_cleanup EXIT
+  trap 'err "Interrupted by signal"; exit 130' INT TERM HUP
+}
+
+# ── Flock-based lock ────────────────────────────────────────────────────────
+acquire_lock() {
+  exec 9>"$LOCKFILE"
+  if ! flock -n 9; then
+    err "Another instance is already running."
+    echo -e "  ${YELLOW}Wait for it to finish, or force remove:${NC} rm ${LOCKFILE}"
+    exit 1
+  fi
+}
+
+# ── dpkg lock helper (used by phases 00 and 01) ─────────────────────────────
+# Waits for the dpkg frontend lock to be released, killing unattended-upgrades
+# if it's the holder. Returns 0 when lock is free, 1 on timeout.
+_wait_for_dpkg_lock() {
+  local max_wait="${1:-90}"
+
+  if ! fuser /var/lib/dpkg/lock-frontend &>/dev/null; then  # stderr expected: fuser probe
+    return 0  # Lock is free
+  fi
+
+  log "dpkg lock held — attempting to release (timeout ${max_wait}s)"
+
+  # Stop unattended-upgrades if it's the culprit
+  if ps aux | grep -q "[u]nattended-upgrade"; then
+    log "Stopping unattended-upgrades service..."
+    systemctl stop unattended-upgrades 2>>"$LOGFILE" || warn "systemctl stop failed (non-fatal)"
+    # Also kill any lingering child processes
+    pkill -f unattended-upgrade 2>/dev/null || warn "no unattended-upgrade process found (non-fatal)"  # stderr expected: no matching process
+  fi
+
+  # Poll until lock is released
+  local elapsed=0
+  while fuser /var/lib/dpkg/lock-frontend &>/dev/null; do  # stderr expected: fuser probe
+    if [[ $elapsed -ge $max_wait ]]; then
+      warn "dpkg lock still held after ${max_wait}s — proceeding with DPkg::Lock::Timeout"
+      return 1
+    fi
+    sleep 3
+    elapsed=$((elapsed + 3))
+    (( elapsed % 15 == 0 )) && log "Still waiting for dpkg lock... (${elapsed}s / ${max_wait}s)"
+  done
+
+  log "dpkg lock released after ${elapsed}s"
+
+  # Clean up any interrupted package state
+  if ! dpkg --configure -a 2>>"$LOGFILE"; then
+    warn "dpkg --configure -a failed (non-fatal) — DPkg::Lock::Timeout will handle"
+  fi
+
+  return 0
+}
diff --git a/dream-server/installers/p2p-gpu/lib/models.sh b/dream-server/installers/p2p-gpu/lib/models.sh
new file mode 100644
index 000000000..0fd4d1f4b
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/models.sh
@@ -0,0 +1,440 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Model Management
+# ============================================================================
+# Part of: dream-server/installers/p2p-gpu/lib/
+# Purpose: Model URL resolution, aria2c-optimized downloads, model swap
+#          watcher for background upgrades, disk-space gating
+#
+# Expects: LOGFILE, PIDFILE_DIR, log(), warn(), env_get(), env_set()
+# Provides: resolve_model_url(), optimize_model_download(),
+#           create_model_swap_watcher(), check_disk_for_download()
+#
+# Modder notes:
+#   resolve_model_url tries 4 strategies in priority order:
+#     1. model-upgrade log  2. upstream tier-map.sh
+#     3. backend JSON configs  4. HuggingFace org probing
+#   create_model_swap_watcher generates a self-contained script that polls
+#   for aria2c completion and hot-swaps the active model.
+#   PIDs are tracked in PIDFILE_DIR for safe cleanup (no pkill -f).
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+# ── GPU-aware tier model resolution ───────────────────────────────────────────
+# Maps GPU VRAM (MB) to the optimal tier model. Mirrors the upstream tier-map
+# logic from dream-server/installers/lib/tier-map.sh but is self-contained so
+# p2p-gpu stays isolated from the core codebase.
+#
+# If the upstream tier-map.sh exists (after DreamServer is cloned), we source it
+# directly for accuracy. Otherwise, fall back to a built-in VRAM lookup table.
+#
+# Sets: TIER_GGUF_FILE, TIER_GGUF_URL, TIER_MODEL_SIZE_MB
+# Args: $1 = ds_dir, $2 = gpu_backend, $3 = gpu_vram_mb, $4 = gpu_count
+resolve_tier_for_gpu() {
+  local ds_dir="$1" gpu_backend="$2" vram_mb="${3:-0}" gpu_count="${4:-1}"
+  local tier_map="${ds_dir}/installers/lib/tier-map.sh"
+
+  local total_vram_mb="${GPU_TOTAL_VRAM:-$(( vram_mb * gpu_count ))}"
+  local reserve_mb_per_gpu="${P2P_TIER_VRAM_RESERVE_MB:-1024}"
+  local effective_vram_mb="$vram_mb"
+  if [[ "$gpu_count" -ge 2 ]]; then
+    local reserve_total=$(( reserve_mb_per_gpu * gpu_count ))
+    if [[ "$total_vram_mb" -gt "$reserve_total" ]]; then
+      effective_vram_mb=$(( total_vram_mb - reserve_total ))
+    else
+      effective_vram_mb="$total_vram_mb"
+    fi
+    log "Tier VRAM budget: per_gpu=${vram_mb}MB total=${total_vram_mb}MB reserve=${reserve_mb_per_gpu}MB x${gpu_count} -> effective=${effective_vram_mb}MB"
+  fi
+
+  TIER_GGUF_FILE=""
+  TIER_GGUF_URL=""
+  TIER_MODEL_SIZE_MB=0
+
+  # Strategy 1: Use upstream tier-map.sh if available (most accurate)
+  if [[ -f "$tier_map" ]]; then
+    local tier=""
+    if [[ "$gpu_backend" == "nvidia" ]]; then
+      if [[ $effective_vram_mb -ge 90000 ]]; then tier="NV_ULTRA"
+      elif [[ $effective_vram_mb -ge 40000 ]]; then tier=4
+      elif [[ $effective_vram_mb -ge 20000 ]]; then tier=3
+      elif [[ $effective_vram_mb -ge 12000 ]]; then tier=2
+      elif [[ $effective_vram_mb -lt 4000 ]]; then tier=0
+      else tier=1; fi
+    elif [[ "$gpu_backend" == "amd" ]]; then
+      if [[ $effective_vram_mb -ge 20000 ]]; then tier=3
+      elif [[ $effective_vram_mb -ge 12000 ]]; then tier=2
+      else tier=1; fi
+    else
+      tier=0  # CPU-only
+    fi
+
+    # Source upstream tier-map in a subshell to avoid polluting our namespace
+    local result
+    result=$(
+      TIER="$tier"
+      MODEL_PROFILE="${MODEL_PROFILE:-qwen}"
+      error() { echo "ERROR: $*" >&2; return 1; }
+      source "$tier_map" 2>>"$LOGFILE"
+      resolve_tier_config 2>>"$LOGFILE"
+      echo "${GGUF_FILE}|${GGUF_URL:-}|${LLM_MODEL_SIZE_MB:-0}"
+    ) || result=""
+
+    if [[ -n "$result" ]]; then
+      TIER_GGUF_FILE="${result%%|*}"
+      local rest="${result#*|}"
+      TIER_GGUF_URL="${rest%%|*}"
+      TIER_MODEL_SIZE_MB="${rest##*|}"
+      if [[ -n "$TIER_GGUF_FILE" ]]; then
+        log "Tier resolved via upstream tier-map: ${TIER_GGUF_FILE} (tier ${tier}, ${effective_vram_mb}MB effective VRAM)"
+        return 0
+      fi
+    fi
+  fi
+
+  # Strategy 2: Built-in VRAM lookup (fallback when tier-map.sh unavailable)
+  # Uses qwen profile defaults matching upstream's set_qwen_tier_config()
+  if [[ "$gpu_backend" == "nvidia" || "$gpu_backend" == "amd" ]]; then
+    local effective_vram="$effective_vram_mb"
+
+    if [[ $effective_vram -ge 90000 ]]; then
+      # NV_ULTRA: B200 (180GB), multi-A100/H100, etc.
+      TIER_GGUF_FILE="qwen3-coder-next-Q4_K_M.gguf"
+      TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3-Coder-Next-GGUF/resolve/main/Qwen3-Coder-Next-Q4_K_M.gguf"
+      TIER_MODEL_SIZE_MB=48500
+    elif [[ $effective_vram -ge 24000 ]]; then
+      # Tier 3-4: RTX 3090/4090 (24GB), A6000 (48GB), A100 (40/80GB), H100 (80GB)
+      TIER_GGUF_FILE="Qwen3-30B-A3B-Q4_K_M.gguf"
+      TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF/resolve/main/Qwen3-30B-A3B-Q4_K_M.gguf"
+      TIER_MODEL_SIZE_MB=18600
+    elif [[ $effective_vram -ge 12000 ]]; then
+      # Tier 2: RTX 3060 (12GB), RTX 4070 (12GB), RTX 3080 Ti (12GB)
+      TIER_GGUF_FILE="Qwen3.5-9B-Q4_K_M.gguf"
+      TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf"
+      TIER_MODEL_SIZE_MB=5760
+    elif [[ $effective_vram -ge 4000 ]]; then
+      # Tier 1: RTX 3070 (8GB), RTX 3080 (10GB), GPUs with 4-12GB VRAM
+      # 4B model (2,870 MB) leaves enough headroom for KV cache on 8GB GPUs
+      TIER_GGUF_FILE="Qwen3.5-4B-Q4_K_M.gguf"
+      TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf"
+      TIER_MODEL_SIZE_MB=2870
+    else
+      # Tier 0: <4GB VRAM or CPU-only
+      TIER_GGUF_FILE="Qwen3.5-2B-Q4_K_M.gguf"
+      TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf"
+      TIER_MODEL_SIZE_MB=1500
+    fi
+  else
+    TIER_GGUF_FILE="Qwen3.5-2B-Q4_K_M.gguf"
+    TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf"
+    TIER_MODEL_SIZE_MB=1500
+  fi
+
+  log "Tier resolved via built-in lookup: ${TIER_GGUF_FILE} (${effective_vram_mb}MB effective VRAM)"
+}
+
+# ── [FIX: disk-check] Verify sufficient disk before starting a download ─────
+# Returns 0 if enough space, 1 if insufficient.
+# Args: $1 = directory to check, $2 = minimum GB required (default: 5)
+check_disk_for_download() {
+  local target_dir="$1"
+  local min_gb="${2:-5}"
+  local avail_gb
+  avail_gb=$(df -BG --output=avail "$target_dir" 2>>"$LOGFILE" | tail -1 | tr -dc '0-9')
+  if [[ "${avail_gb:-0}" -lt "$min_gb" ]]; then
+    warn "Insufficient disk space: ${avail_gb}GB available, ${min_gb}GB needed in ${target_dir}"
+    return 1
+  fi
+  return 0
+}
+
+# ── [FIX: pkill] PID-file based process management ─────────────────────────
+# Store a background process PID so we can stop it safely later.
+_store_pid() {
+  local name="$1" pid="$2"
+  # [NON-FATAL: pidfile] Missing pidfile dir only affects cleanup tracking.
+  mkdir -p "$PIDFILE_DIR" 2>>"$LOGFILE" || warn "could not create pidfile directory ${PIDFILE_DIR} (non-fatal)"
+  echo "$pid" > "${PIDFILE_DIR}/${name}.pid"
+}
+
+# Kill a previously stored PID by name. Safe — only kills the exact PID.
+_kill_stored_pid() {
+  local name="$1"
+  local pidfile="${PIDFILE_DIR}/${name}.pid"
+  [[ ! -f "$pidfile" ]] && return 0
+  local pid
+  pid=$(cat "$pidfile" 2>>"$LOGFILE" || echo "")
+  if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then  # stderr expected: process may already have exited
+    # [NON-FATAL: cleanup] Process may already be gone; continue cleanup.
+    kill "$pid" 2>>"$LOGFILE" || warn "Could not kill ${name} (PID ${pid})"
+  fi
+  rm -f "$pidfile"
+}
+
+# Check if a stored PID is still running.
+_is_pid_running() {
+  local name="$1"
+  local pidfile="${PIDFILE_DIR}/${name}.pid"
+  [[ ! -f "$pidfile" ]] && return 1
+  local pid
+  pid=$(cat "$pidfile" 2>>"$LOGFILE" || echo "")
+  [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null # stderr expected: process may already have exited
+}
+
+# Resolve download URL for a model filename
+resolve_model_url() {
+  local ds_dir="$1" model_name="$2"
+
+  # Strategy 1: model-upgrade log
+  local url
+  url=$(_resolve_from_log "$ds_dir" "$model_name") && [[ -n "$url" ]] && { echo "$url"; return 0; }
+
+  # Strategy 2: upstream tier-map.sh
+  url=$(_resolve_from_tiermap "$ds_dir" "$model_name") && [[ -n "$url" ]] && { echo "$url"; return 0; }
+
+  # Strategy 3: backend JSON configs
+  url=$(_resolve_from_backends "$ds_dir" "$model_name") && [[ -n "$url" ]] && { echo "$url"; return 0; }
+
+  # Strategy 4: probe common HuggingFace orgs
+  url=$(_resolve_from_hf_probe "$model_name") && [[ -n "$url" ]] && { echo "$url"; return 0; }
+
+  return 1
+}
+
+_resolve_from_log() {
+  local ds_dir="$1" model_name="$2"
+  local upgrade_log="${ds_dir}/logs/model-upgrade.log"
+  [[ ! -f "$upgrade_log" ]] && return 1
+  grep -oP 'https://huggingface\.co/[^\s"]+'"${model_name}" "$upgrade_log" | tail -1 || return 1
+}
+
+_resolve_from_tiermap() {
+  local ds_dir="$1" model_name="$2"
+  local tier_map="${ds_dir}/installers/lib/tier-map.sh"
+  [[ ! -f "$tier_map" ]] && return 1
+  grep -oP 'https://huggingface\.co/[^\s"'"'"']+'"${model_name}" "$tier_map" | head -1 || return 1
+}
+
+_resolve_from_backends() {
+  local ds_dir="$1" model_name="$2"
+  local backend_dir="${ds_dir}/config/backends"
+  [[ ! -d "$backend_dir" ]] && return 1
+  grep -rhoP 'https://huggingface\.co/[^\s"]+'"${model_name}" "$backend_dir" | head -1 || return 1
+}
+
+_resolve_from_hf_probe() {
+  local model_name="$1"
+  local base_name
+  base_name=$(echo "$model_name" | sed -E 's/-[QqFf][0-9_]+[A-Za-z]*\.gguf$//')
+  [[ -z "$base_name" ]] && return 1
+
+  local org
+  for org in "unsloth" "bartowski" "lmstudio-community"; do
+    local test_url="https://huggingface.co/${org}/${base_name}-GGUF/resolve/main/${model_name}"
+    if curl -sfI --max-time 10 "$test_url" | grep -qi "200\|302\|301"; then
+      echo "$test_url"
+      return 0
+    fi
+  done
+  return 1
+}
+
+# Resume/restart incomplete model downloads with aria2c
+optimize_model_download() {
+  local ds_dir="$1"
+  local data_dir="${ds_dir}/data"
+
+  local part_files
+  part_files=$(find "${data_dir}/models/" -name "*.gguf.part" -type f 2>&1 || echo "")
+
+  if [[ -z "$part_files" ]]; then
+    if _is_pid_running "aria2c-model"; then
+      log "aria2c download already running"
+      return 0
+    fi
+    log "No incomplete model downloads found — models are ready"
+    return 0
+  fi
+
+  local part_file part_name part_size_mb gguf_url
+  part_file=$(echo "$part_files" | head -1)
+  part_name=$(basename "$part_file" .part)
+  part_size_mb=$(( $(stat -c%s "$part_file" || echo 0) / 1048576 ))
+
+  warn "Incomplete download: ${part_name} (${part_size_mb} MB so far)"
+
+  # [FIX: pkill] Kill only known PIDs, not by pattern
+  _kill_stored_pid "curl-model"
+  _kill_stored_pid "wget-model"
+  sleep 2
+
+  # [FIX: disk-check] Verify at least 5GB free before resuming
+  if ! check_disk_for_download "${data_dir}/models" 5; then
+    warn "Skipping model download — insufficient disk space"
+    return 0
+  fi
+
+  gguf_url=$(resolve_model_url "$ds_dir" "$part_name") || {
+    warn "Could not resolve download URL for ${part_name} — leaving original download"
+    return 0
+  }
+
+  log "Restarting download with aria2c (8 threads)..."
+  rm -f "$part_file"
+  mkdir -p "${ds_dir}/logs"
+
+  nohup aria2c \
+    -x 8 -s 8 -k 10M \
+    --continue=true \
+    --max-tries=0 \
+    --retry-wait=5 \
+    --timeout=60 \
+    --connect-timeout=30 \
+    --file-allocation=none \
+    --auto-file-renaming=false \
+    --console-log-level=warn \
+    --summary-interval=30 \
+    --check-integrity=true \
+    -d "${data_dir}/models" \
+    -o "${part_name}" \
+    "${gguf_url}" \
+    >> "${ds_dir}/logs/aria2c-download.log" 2>&1 &
+
+  local aria_pid=$!
+  _store_pid "aria2c-model" "$aria_pid"
+  log "aria2c started (PID: ${aria_pid})"
+  create_model_swap_watcher "$ds_dir" "$part_name"
+}
+
+# Generate and start a model swap watcher script
+create_model_swap_watcher() {
+  local ds_dir="$1" model_name="$2"
+  local watcher_script="${ds_dir}/scripts/model-swap-on-complete.sh"
+  local pidfile_dir="${PIDFILE_DIR:-/var/run/dreamserver-p2p-gpu}"
+  mkdir -p "${ds_dir}/scripts"
+
+  cat > "$watcher_script" << 'WATCHER_EOF'
+#!/usr/bin/env bash
+set -euo pipefail
+# Auto-swap model when aria2c download completes
+
+SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+MODEL_DIR="${SCRIPT_DIR}/data/models"
+ENV_FILE="${SCRIPT_DIR}/.env"
+PIDFILE="__PIDFILE_DIR__/aria2c-model.pid"
+TARGET_MODEL="__TARGET_MODEL__"
+warn() { echo -e "\033[1;33m[!]\033[0m $*" >&2; }
+
+compose_cmd() {
+  if docker compose version &>/dev/null 2>&1; then
+    echo "docker compose"
+  elif command -v docker-compose &>/dev/null; then
+    echo "docker-compose"
+  else
+    echo "docker restart"
+  fi
+}
+
+is_download_running() {
+  [[ ! -f "$PIDFILE" ]] && return 1
+  local pid
+  pid=$(cat "$PIDFILE" 2>/dev/null || echo "") # stderr expected: pidfile can be unreadable/missing during shutdown race
+  [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null # stderr expected: "No such process" while download exits
+}
+
+swap_model() {
+  local new_model="$1"
+  local old_model
+  old_model=$(grep '^GGUF_FILE=' "$ENV_FILE" | cut -d= -f2 | tr -d '"' || echo "")
+  [[ "$new_model" == "$old_model" ]] && return 0
+
+  # Convert GGUF filename -> Dream model id used by other services.
+  # Example: Qwen3-30B-A3B-Q4_K_M.gguf -> qwen3-30b-a3b
+  local new_llm_model
+  new_llm_model=$(echo "$new_model" \
+    | sed -E 's/\.(gguf|GGUF)$//' \
+    | sed -E 's/-Q[0-9]+([._][A-Za-z0-9]+)*$//' \
+    | tr '[:upper:]' '[:lower:]')
+
+  # Validate new model file before swapping
+  local model_path="${MODEL_DIR}/${new_model}"
+  if [[ ! -f "$model_path" ]]; then
+    warn "Model file not found: ${model_path} — skipping swap"
+    return 1
+  fi
+  local file_size
+  file_size=$(stat -c%s "$model_path" 2>/dev/null || echo 0) # stderr expected: file can disappear during concurrent cleanup
+  if [[ "$file_size" -lt 100000000 ]]; then
+    warn "Model file too small (${file_size} bytes) — skipping swap"
+    return 1
+  fi
+
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Swapping: ${old_model} -> ${new_model} ($(( file_size / 1048576 )) MB)"
+  # [FIX: tmpfile-race] Use sed -i to avoid world-readable temp file with secrets
+  sed -i "s|^GGUF_FILE=.*|GGUF_FILE=${new_model}|" "$ENV_FILE"
+  if grep -q '^LLM_MODEL=' "$ENV_FILE"; then
+    sed -i "s|^LLM_MODEL=.*|LLM_MODEL=${new_llm_model}|" "$ENV_FILE"
+  else
+    echo "LLM_MODEL=${new_llm_model}" >> "$ENV_FILE"
+  fi
+
+  # Update model size for VRAM budget calculations
+  local new_size_mb
+  new_size_mb=$(stat -c%s "$model_path" 2>/dev/null || echo 0) # stderr expected: file can disappear during cleanup
+  new_size_mb=$(( new_size_mb / 1048576 ))
+  sed -i "s|^LLM_MODEL_SIZE_MB=.*|LLM_MODEL_SIZE_MB=${new_size_mb}|" "$ENV_FILE"
+  if ! grep -q '^LLM_MODEL_SIZE_MB=' "$ENV_FILE"; then
+    echo "LLM_MODEL_SIZE_MB=${new_size_mb}" >> "$ENV_FILE"
+  fi
+
+  # Use compose recreate (re-reads .env) instead of docker restart (ignores .env changes)
+  local cmd
+  cmd=$(compose_cmd)
+  if [[ "$cmd" == "docker compose" ]]; then
+    # [NON-FATAL: service] Llama restart can be retried if compose fails.
+    cd "$SCRIPT_DIR" && docker compose up -d llama-server || warn "compose recreate failed (non-fatal)"
+  elif [[ "$cmd" == "docker-compose" ]]; then
+    # [NON-FATAL: service] Llama restart can be retried if compose fails.
+    cd "$SCRIPT_DIR" && docker-compose up -d llama-server || warn "compose recreate failed (non-fatal)"
+  else
+    # [NON-FATAL: service] Restart failure should not block the watcher.
+    docker restart dream-llama-server || warn "llama-server restart failed (non-fatal)"
+  fi
+  # Restart dependent services so they pick up new model env / auto-detection.
+  for cname in dream-dreamforge dream-openclaw dream-dashboard-api; do
+    if docker ps --format '{{.Names}}' | grep -qx "$cname"; then
+      # [NON-FATAL: service] Dependent restarts are best-effort.
+      docker restart "$cname" || warn "${cname} restart failed (non-fatal)"
+    fi
+  done
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Swapped to ${new_model} — llama-server reloading"
+}
+
+while true; do
+  if ! is_download_running; then
+    if [[ -n "${TARGET_MODEL:-}" && -f "${MODEL_DIR}/${TARGET_MODEL}" ]]; then
+      swap_model "$TARGET_MODEL"
+    else
+      local_model=$(ls -S "${MODEL_DIR}"/*.gguf 2>&1 | head -1 | xargs -r basename || echo "")
+      if [[ -n "${local_model:-}" ]]; then
+        swap_model "$local_model"
+      fi
+    fi
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Watcher exiting — download complete"
+    exit 0
+  fi
+  sleep 30
+done
+WATCHER_EOF
+
+  sed -i "s|__PIDFILE_DIR__|${pidfile_dir}|g" "$watcher_script"
+  sed -i "s|__TARGET_MODEL__|${model_name}|g" "$watcher_script"
+  chmod +x "$watcher_script"
+  nohup "$watcher_script" >> "${ds_dir}/logs/model-swap.log" 2>&1 &
+  local watcher_pid=$!
+  _store_pid "model-swap-watcher" "$watcher_pid"
+  log "Model swap watcher started (PID: ${watcher_pid})"
+}
diff --git a/dream-server/installers/p2p-gpu/lib/networking.sh b/dream-server/installers/p2p-gpu/lib/networking.sh
new file mode 100644
index 000000000..0a155efbc
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/networking.sh
@@ -0,0 +1,629 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Networking & Access Layer
+# ============================================================================
+# Part of: p2p-gpu/lib/
+# Purpose: Port exposure, Caddy reverse proxy, health page, Cloudflare
+#          tunnel, SSH tunnel script, access info display
+#
+# Expects: LOGFILE, SCRIPT_NAME, log(), warn(), err(), env_set(), env_get(),
+#          discover_all_services(), discover_service_ports()
+# Provides: expose_ports_for_vastai(), setup_reverse_proxy(),
+#           generate_health_page(), setup_cloudflare_tunnel(),
+#           generate_ssh_tunnel_script(), generate_powershell_tunnel_script(),
+#           print_access_info()
+#
+# Modder notes:
+#   Caddy failure is non-fatal — falls back to SSH tunnel mode.
+#   print_access_info is split into sub-functions for maintainability.
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+# Rewrite 127.0.0.1 → 0.0.0.0 in compose port bindings for Vast.ai access
+expose_ports_for_vastai() {
+  local ds_dir="$1"
+  # Safety: only rebind on detected P2P GPU providers to avoid
+  # accidentally exposing services on non-rented machines.
+  if [[ -z "${VAST_TCP_PORT_22:-}" && -z "${PUBLIC_IPADDR:-}" \
+        && ! -f /etc/vast.ai && "${PROVIDER_NAME:-}" != "vastai" ]]; then
+    warn "Not a detected P2P GPU environment — skipping port rebinding"
+    return 0
+  fi
+  log "Rebinding Docker ports from 127.0.0.1 → 0.0.0.0 for Vast.ai external access"
+  local count=0
+  while IFS= read -r -d '' compose_file; do
+    if grep -q '"127\.0\.0\.1:' "$compose_file"; then
+      sed -i 's/"127\.0\.0\.1:/"0.0.0.0:/g' "$compose_file"
+      count=$((count + 1))
+    fi
+  # [NON-FATAL: discovery] Missing compose files just skips port rebinding.
+  done < <(find "$ds_dir" -maxdepth 4 \
+    \( -name "docker-compose*.yml" -o -name "compose*.yaml" -o -name "compose*.yml" \) \
+    -print0 2>&1 || warn "find compose files failed (non-fatal)")
+  log "Rebound ports in ${count} compose file(s) to 0.0.0.0"
+}
+
+# Deploy Caddy reverse proxy for single-port access
+setup_reverse_proxy() {
+  local ds_dir="$1"
+  local proxy_port="${2:-8080}"
+  local env_file="${ds_dir}/.env"
+
+  _install_caddy || return 1
+  _generate_caddyfile "$ds_dir" "$proxy_port" "$env_file"
+  _start_caddy "$ds_dir" "$proxy_port" "$env_file" || return 1
+  _wait_for_proxy_backend "$proxy_port"
+}
+
+_install_caddy() {
+  command -v caddy &>/dev/null && return 0
+  log "Installing Caddy reverse proxy..."
+  if apt-get install -y -qq debian-keyring debian-archive-keyring apt-transport-https 2>>"$LOGFILE" \
+    && curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' \
+       | gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg 2>>"$LOGFILE" \
+    && curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' \
+       | tee /etc/apt/sources.list.d/caddy-stable.list > /dev/null \
+    && apt-get update -qq 2>>"$LOGFILE" \
+    && apt-get install -y -qq caddy 2>>"$LOGFILE"; then
+    log "Caddy installed successfully"
+  else
+    warn "Caddy install failed — falling back to SSH tunnel mode"
+    return 1
+  fi
+}
+
+_generate_caddyfile() {
+  local ds_dir="$1" proxy_port="$2" env_file="$3"
+  local caddy_dir="${ds_dir}/config/caddy"
+  mkdir -p "$caddy_dir"
+
+  local webui_port
+  webui_port="$(env_get "$env_file" "WEBUI_PORT")"
+  webui_port="${webui_port:-3000}"
+
+  # Dashboard is the main DreamServer panel — it owns the root route.
+  # Its nginx serves SPA assets and proxies /api/ → dashboard-api with auth.
+  local dashboard_port
+  dashboard_port="$(env_get "$env_file" "DASHBOARD_PORT")"
+  dashboard_port="${dashboard_port:-3001}"
+
+  cat > "${caddy_dir}/Caddyfile" << CADDYEOF
+# DreamServer reverse proxy — auto-generated by vastai setup
+{
+  auto_https off
+  admin off
+}
+
+:${proxy_port} {
+  # Dashboard — main DreamServer panel (root route)
+  handle / {
+    reverse_proxy 127.0.0.1:${dashboard_port}
+  }
+  handle /api/* {
+    reverse_proxy 127.0.0.1:${dashboard_port}
+  }
+  handle /assets/* {
+    reverse_proxy 127.0.0.1:${dashboard_port}
+  }
+
+  # Open WebUI — full access via SSH tunnel (port ${webui_port})
+  handle_path /chat/* {
+    reverse_proxy 127.0.0.1:${webui_port}
+  }
+
+  handle_path /health {
+    root * ${caddy_dir}
+    file_server
+    try_files /health.html
+  }
+
+CADDYEOF
+
+  # Append auto-discovered service routes (skip root-handled services)
+  while IFS='|' read -r sid port_env port_def _name _cat proxy_mode _startup _cname; do
+    [[ -z "$port_env" || "$sid" == "open-webui" || "$sid" == "dashboard" ]] && continue
+    local svc_port
+    svc_port="$(env_get "$env_file" "$port_env")"
+    svc_port="${svc_port:-$port_def}"
+    [[ -z "$svc_port" ]] && continue
+
+    if [[ "$proxy_mode" == "root" ]]; then
+      printf '  handle /%s/* {\n    reverse_proxy 127.0.0.1:%s\n  }\n\n' \
+        "$sid" "$svc_port" >> "${caddy_dir}/Caddyfile"
+    else
+      printf '  handle_path /%s/* {\n    reverse_proxy 127.0.0.1:%s\n  }\n\n' \
+        "$sid" "$svc_port" >> "${caddy_dir}/Caddyfile"
+    fi
+  done < <(discover_all_services "$ds_dir")
+
+  # Ollama (base service, no manifest)
+  local ollama_port
+  ollama_port="$(env_get "$env_file" "OLLAMA_PORT")"
+  ollama_port="${ollama_port:-8080}"
+  cat >> "${caddy_dir}/Caddyfile" << CADDYTAIL
+  handle_path /ollama/* {
+    reverse_proxy 127.0.0.1:${ollama_port}
+  }
+  handle_path /v1/* {
+    reverse_proxy 127.0.0.1:${ollama_port}
+  }
+}
+CADDYTAIL
+
+  generate_health_page "${caddy_dir}/health.html" "$ds_dir"
+}
+
+_start_caddy() {
+  local ds_dir="$1" proxy_port="$2" env_file="$3"
+  local caddy_dir="${ds_dir}/config/caddy"
+
+  if pgrep -x caddy > /dev/null 2>&1; then
+    local old_pid
+    old_pid=$(pgrep -x caddy | head -1)
+    # [NON-FATAL: cleanup] Old proxy process may have already exited.
+    kill "$old_pid" || warn "could not kill old caddy PID ${old_pid} (non-fatal)"
+    sleep 1
+  fi
+
+  mkdir -p "${ds_dir}/logs"
+  nohup caddy run --config "${caddy_dir}/Caddyfile" --adapter caddyfile \
+    >> "${ds_dir}/logs/caddy-proxy.log" 2>&1 &
+  local caddy_pid=$!
+  sleep 2
+
+  if kill -0 "$caddy_pid" 2>&1; then
+    log "Caddy reverse proxy running on port ${proxy_port} (PID: ${caddy_pid})"
+    env_set "${env_file}" "REVERSE_PROXY_PORT" "$proxy_port"
+    return 0
+  else
+    warn "Caddy failed to start — check ${ds_dir}/logs/caddy-proxy.log"
+    return 1
+  fi
+}
+
+_wait_for_proxy_backend() {
+  local proxy_port="$1"
+  local elapsed=0 code="000"
+
+  while [[ "$elapsed" -lt 30 ]]; do
+    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "http://127.0.0.1:${proxy_port}/" || echo "000")
+    if [[ "$code" =~ ^[23] ]]; then
+      return 0
+    fi
+    sleep 3
+    elapsed=$((elapsed + 3))
+  done
+
+  warn "Caddy is running on ${proxy_port}, but dashboard backend is not reachable yet (HTTP ${code})"
+  return 1
+}
+
+# Generate health dashboard HTML page
+generate_health_page() {
+  local output_file="$1"
+  cat > "$output_file" << 'HEALTHEOF'
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>DreamServer — Health</title>
+  <style>
+    * { margin: 0; padding: 0; box-sizing: border-box; }
+    body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
+           background: #0f1117; color: #e0e0e0; padding: 2rem; }
+    h1 { color: #7dd3fc; margin-bottom: 1.5rem; font-size: 1.5rem; }
+    .grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(240px, 1fr)); gap: 1rem; }
+    .card { background: #1a1d27; border-radius: 12px; padding: 1rem; border: 1px solid #2a2d37; }
+    .card h3 { font-size: 0.95rem; margin-bottom: 0.5rem; }
+    .status { display: inline-block; width: 10px; height: 10px; border-radius: 50%; margin-right: 8px; }
+    .healthy { background: #22c55e; } .running { background: #eab308; }
+    .stopped { background: #6b7280; } .error   { background: #ef4444; }
+    .port { color: #7dd3fc; font-size: 0.85rem; }
+    .ts { color: #6b7280; font-size: 0.8rem; margin-top: 1rem; }
+  </style>
+</head>
+<body>
+  <h1>DreamServer Health</h1>
+  <div class="grid" id="services"><div class="card"><p>Loading...</p></div></div>
+  <p class="ts">Auto-refreshes every 15s</p>
+  <script>
+    async function refresh() {
+      try {
+        const r = await fetch('/dashboard-api/api/v1/status');
+        const data = await r.json();
+        const grid = document.getElementById('services');
+        grid.innerHTML = '';
+        (data.services || []).forEach(s => {
+          const cls = s.healthy ? 'healthy' : (s.status === 'running' ? 'running' : 'stopped');
+          grid.innerHTML += '<div class="card"><h3><span class="status '+cls+'"></span>'+s.name+'</h3><span class="port">:'+( s.port||'—')+'</span></div>';
+        });
+      } catch(e) {
+        document.getElementById('services').innerHTML =
+          '<div class="card"><p>Dashboard API not available.</p></div>';
+      }
+    }
+    refresh(); setInterval(refresh, 15000);
+  </script>
+</body>
+</html>
+HEALTHEOF
+  log "Generated health dashboard at ${output_file}"
+}
+
+# Start Cloudflare Tunnel if token is configured
+setup_cloudflare_tunnel() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+  local cf_token
+  cf_token="$(env_get "$env_file" "CLOUDFLARE_TUNNEL_TOKEN")"
+  [[ -z "$cf_token" ]] && return 0
+
+  log "Cloudflare Tunnel token detected — setting up tunnel"
+  if ! command -v cloudflared &>/dev/null; then
+    local cf_tmp="/tmp/cloudflared-$$"
+    local cf_checksum_url="https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.sha256"
+    curl -sL https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 \
+      -o "$cf_tmp" || { warn "cloudflared download failed (non-fatal)"; rm -f "$cf_tmp"; return 0; }
+    # Verify checksum when available
+    local expected_sha
+    expected_sha=$(curl -sL --max-time 10 "$cf_checksum_url" 2>>"$LOGFILE" | awk '{print $1}' || echo "")
+    if [[ -n "$expected_sha" ]]; then
+      local actual_sha
+      actual_sha=$(sha256sum "$cf_tmp" | awk '{print $1}')
+      if [[ "$actual_sha" != "$expected_sha" ]]; then
+        err "cloudflared checksum mismatch (expected: ${expected_sha:0:12}…, got: ${actual_sha:0:12}…)"
+        rm -f "$cf_tmp"
+        warn "Skipping Cloudflare tunnel — binary integrity check failed"
+        return 0
+      fi
+      log "cloudflared checksum verified"
+    else
+      warn "cloudflared checksum not available — skipping integrity check"
+    fi
+    mv "$cf_tmp" /usr/local/bin/cloudflared
+    chmod +x /usr/local/bin/cloudflared
+  fi
+
+  local proxy_port
+  proxy_port="$(env_get "$env_file" "REVERSE_PROXY_PORT")"
+  proxy_port="${proxy_port:-3000}"
+
+  mkdir -p "${ds_dir}/logs"
+  # [FIX: cf-token] Pass token via env var, not CLI arg (hidden from ps aux)
+  TUNNEL_TOKEN="$cf_token" nohup cloudflared tunnel --no-autoupdate run --token-from-env TUNNEL_TOKEN \
+    >> "${ds_dir}/logs/cloudflared.log" 2>&1 &
+  local cf_pid=$!
+  # [NON-FATAL: pidfile] Missing pidfile only affects teardown cleanup.
+  _store_pid "cloudflared" "$cf_pid" 2>>"$LOGFILE" || warn "could not persist cloudflared pid (non-fatal)"
+  log "Cloudflare Tunnel started (PID: ${cf_pid}) — HTTPS access active"
+}
+
+# Returns 0 when the supplied IP is RFC1918/private or otherwise unusable as a public endpoint.
+_is_private_ip() {
+  local ip="$1"
+
+  [[ -z "$ip" ]] && return 0
+
+  case "$ip" in
+    10.*|172.1[6-9].*|172.2[0-9].*|172.3[0-1].*|192.168.*|169.254.*|127.*) return 0 ;;
+    *) return 1 ;;
+  esac
+}
+
+# Get Vast.ai SSH connection info with proper env var handling
+_get_vastai_ssh_info() {
+  local host_ip="" ssh_port=""
+
+  # Priority 1: Vast.ai publishes the authoritative public IP here.
+  host_ip="${PUBLIC_IPADDR:-}"
+  ssh_port="${VAST_TCP_PORT_22:-}"
+
+  # Priority 2: /proc/self/environ (handles SSH sessions that strip env vars)
+  if [[ -z "$host_ip" || -z "$ssh_port" ]]; then
+    if [[ -r /proc/self/environ ]]; then
+      if [[ -z "$host_ip" ]]; then
+        host_ip="$(tr '\0' '\n' < /proc/self/environ | grep '^PUBLIC_IPADDR=' | cut -d= -f2 || echo "")"
+      fi
+      if [[ -z "$ssh_port" ]]; then
+        ssh_port="$(tr '\0' '\n' < /proc/self/environ | grep '^VAST_TCP_PORT_22=' | cut -d= -f2 || echo "")"
+      fi
+    fi
+  fi
+
+  # Priority 3: /etc/environment (Vast.ai onstart may export vars here)
+  if [[ -z "$host_ip" && -f /etc/environment ]]; then
+    host_ip="$(grep '^PUBLIC_IPADDR=' /etc/environment 2>/dev/null | cut -d= -f2 | tr -d '"' || echo "")"  # stderr expected: file may be absent or unreadable
+  fi
+  if [[ -z "$ssh_port" && -f /etc/environment ]]; then
+    ssh_port="$(grep '^VAST_TCP_PORT_22=' /etc/environment 2>/dev/null | cut -d= -f2 | tr -d '"' || echo "")"  # stderr expected: file may be absent or unreadable
+  fi
+
+  # Discard any detected private/NAT address.
+  if _is_private_ip "$host_ip"; then
+    host_ip=""
+  fi
+
+  # Priority 4: External IP detection (reliable fallback)
+  if [[ -z "$host_ip" ]]; then
+    host_ip="$(curl -sf --max-time 5 ifconfig.me 2>>"$LOGFILE" || curl -sf --max-time 5 icanhazip.com 2>>"$LOGFILE" || echo '<your-vast-ip>')"
+  fi
+  if [[ -z "$ssh_port" ]]; then
+    ssh_port="22"
+  fi
+
+  echo "${host_ip}|${ssh_port}"
+}
+
+# Generate auto-reconnecting SSH tunnel script
+generate_ssh_tunnel_script() {
+  local ds_dir="$1"
+  local host_ip ssh_port
+  IFS='|' read -r host_ip ssh_port <<< "$(_get_vastai_ssh_info)"
+
+  local env_file="${ds_dir}/.env"
+  local entry_port
+  entry_port="$(env_get "$env_file" "DASHBOARD_PORT")"
+  entry_port="${entry_port:-3001}"
+  local local_proxy_port="58080"
+
+  local script_path="${ds_dir}/connect-tunnel.sh"
+  {
+    echo '#!/usr/bin/env bash'
+    echo '# DreamServer — auto-reconnecting SSH tunnel (run on YOUR LOCAL machine)'
+    echo "HOST=\"${host_ip}\""
+    echo "SSH_PORT=\"${ssh_port}\""
+    echo "ENTRY_PORT=\"${entry_port}\""
+    echo '_uname="$(uname -s | tr "[:upper:]" "[:lower:]")"'
+    echo 'case "${_uname}" in'
+    echo "  mingw*|msys*|cygwin*) _default_local_proxy=${local_proxy_port} ;;"
+    echo "  *) _default_local_proxy=${local_proxy_port} ;;"
+    echo 'esac'
+    echo 'LOCAL_PROXY_PORT="${LOCAL_PROXY_PORT:-${_default_local_proxy}}"'
+    echo 'if [[ "${FULL_TUNNEL:-0}" == "1" ]]; then'
+    echo '  FORWARDS="-L ${LOCAL_PROXY_PORT}:127.0.0.1:${ENTRY_PORT}'
+    discover_service_ports "$ds_dir" | while IFS='|' read -r key port _label; do
+      [[ "$key" == "REVERSE_PROXY_PORT" ]] && continue
+      local_port="$port"
+      [[ "$port" -lt 1024 ]] && local_port=$((10000 + port))
+      echo "    -L ${local_port}:127.0.0.1:${port}"
+    done
+    echo '  "'
+    echo 'else'
+    echo '  FORWARDS="-L ${LOCAL_PROXY_PORT}:127.0.0.1:${ENTRY_PORT}"'
+    echo 'fi'
+    echo 'DELAY=5'
+    echo 'while true; do'
+    echo '  ssh -N -o ServerAliveInterval=15 -o ServerAliveCountMax=3 \'
+    echo '      -o ExitOnForwardFailure=yes \'
+    echo '      -p "$SSH_PORT" $FORWARDS root@"$HOST"'
+    echo '  echo "[!] Connection lost. Reconnecting in ${DELAY}s..."'
+    echo '  sleep "$DELAY"'
+    echo '  DELAY=$(( DELAY < 60 ? DELAY * 2 : 60 ))'
+    echo 'done'
+  } > "$script_path"
+  chmod +x "$script_path"
+  log "Generated auto-reconnecting tunnel script: ${script_path}"
+}
+
+generate_powershell_tunnel_script() {
+  local ds_dir="$1"
+  local host_ip ssh_port
+  IFS='|' read -r host_ip ssh_port <<< "$(_get_vastai_ssh_info)"
+
+  local env_file="${ds_dir}/.env"
+  local entry_port
+  entry_port="$(env_get "$env_file" "DASHBOARD_PORT")"
+  entry_port="${entry_port:-3001}"
+  local local_proxy_port="58080"
+
+  local script_path="${ds_dir}/connect-tunnel.ps1"
+  {
+    cat << POWERSHELL_HEAD
+param(
+  [int]\$LocalProxyPort = ${local_proxy_port},
+  [int]\$ReconnectDelay = 5,
+  [string]\$Host = "${host_ip}",
+  [int]\$SshPort = ${ssh_port}
+)
+
+\$EntryPort = ${entry_port}
+while (\$true) {
+  \$Forwards = @(
+    "-L"; "\${LocalProxyPort}:127.0.0.1:\$EntryPort";
+POWERSHELL_HEAD
+    discover_service_ports "$ds_dir" | while IFS='|' read -r key port _label; do
+      [[ "$key" == "REVERSE_PROXY_PORT" ]] && continue
+      lp="$port"
+      [[ "$port" -lt 1024 ]] && lp=$((10000 + port))
+      printf '    "-L"; "%s:127.0.0.1:%s";\n' "$lp" "$port"
+    done
+    cat << 'POWERSHELL_TAIL'
+  )
+  ssh -N -o ServerAliveInterval=15 -o ServerAliveCountMax=3 -o ExitOnForwardFailure=yes -p $SshPort @Forwards "root@$Host"
+  Write-Host "[!] Connection lost. Reconnecting in ${ReconnectDelay}s..."
+  Start-Sleep -Seconds $ReconnectDelay
+  if ($ReconnectDelay -lt 60) {
+    $ReconnectDelay = [Math]::Min($ReconnectDelay * 2, 60)
+  }
+}
+POWERSHELL_TAIL
+  } > "$script_path"
+  log "Generated PowerShell tunnel script: ${script_path}"
+}
+
+# ── Print access info (split into sub-functions) ───────────────────────────
+print_access_info() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+  local host_ip ssh_port
+  local dash_api_status dashboard_status webui_status
+  IFS='|' read -r host_ip ssh_port <<< "$(_get_vastai_ssh_info)"
+  dash_api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing") # stderr expected: container may not exist
+  dashboard_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing") # stderr expected: container may not exist
+  webui_status=$(
+    docker inspect --format '{{.State.Status}}' dream-webui 2>/dev/null || # stderr expected: container may not exist
+    docker inspect --format '{{.State.Status}}' dream-open-webui 2>/dev/null || # stderr expected: container may not exist
+    echo "missing"
+  )
+
+  echo ""
+  if [[ "$dash_api_status" == "running" && ( "$dashboard_status" == "running" || "$webui_status" == "running" ) ]]; then
+    echo -e "${CYAN}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+    echo -e "${CYAN}${BOLD}  DreamServer is ready on Vast.ai!${NC}"
+    echo -e "${CYAN}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+  else
+    echo -e "${YELLOW}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+    echo -e "${YELLOW}${BOLD}  DreamServer access info (core services still starting)${NC}"
+    echo -e "${YELLOW}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+  fi
+  echo ""
+  echo -e "${BOLD}Working directory:${NC} ${ds_dir}"
+  echo -e "${BOLD}Setup log:${NC}         ${LOGFILE}"
+  echo ""
+
+  _print_ssh_section "$ds_dir" "$env_file" "$host_ip" "$ssh_port"
+  _print_service_list "$ds_dir"
+  _print_model_upload_help "$ds_dir" "$host_ip" "$ssh_port"
+  _print_commands_help "$ds_dir"
+}
+
+_print_proxy_section() {
+  local ds_dir="$1" env_file="$2" host_ip="$3"
+  local proxy_port root_code proxy_ready="false"
+  proxy_port="$(env_get "$env_file" "REVERSE_PROXY_PORT")"
+  if [[ -n "$proxy_port" ]] && pgrep -x caddy > /dev/null 2>&1; then
+    root_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "http://127.0.0.1:${proxy_port}/" || echo "000")
+    [[ "$root_code" =~ ^[23] ]] && proxy_ready="true"
+
+    # On Vast.ai, the container's internal port is remapped to a random
+    # external port exposed in VAST_TCP_PORT_<internal>. If that var exists,
+    # the public URL must use the mapped port — not the internal one.
+    local ext_port_var="VAST_TCP_PORT_${proxy_port}"
+    local ext_port="${!ext_port_var:-}"
+    if [[ "$proxy_ready" == "true" ]]; then
+      echo -e "${GREEN}${BOLD}▸ Reverse Proxy Active (single-port access!)${NC}"
+    else
+      echo -e "${YELLOW}${BOLD}▸ Reverse proxy process is running (backends still starting)${NC}"
+    fi
+    echo ""
+    if [[ -n "$ext_port" ]]; then
+      echo -e "  ${BOLD}Public (Vast.ai mapped):${NC}"
+      echo -e "    Dashboard:    ${BOLD}http://${host_ip}:${ext_port}/${NC}"
+      echo "    Open WebUI:     http://${host_ip}:${ext_port}/chat/"
+      echo "    n8n Workflows:  http://${host_ip}:${ext_port}/n8n/"
+      echo "    Health Status:  http://${host_ip}:${ext_port}/health"
+      echo ""
+      echo -e "  ${DIM}(Internal port ${proxy_port} is remapped by Vast.ai to ${ext_port}.)${NC}"
+    else
+      echo -e "  Dashboard:    ${BOLD}http://${host_ip}:${proxy_port}/${NC}"
+      echo "    Open WebUI:     http://${host_ip}:${proxy_port}/chat/"
+      echo "    n8n Workflows:  http://${host_ip}:${proxy_port}/n8n/"
+      echo "    Health Status:  http://${host_ip}:${proxy_port}/health"
+      echo ""
+      echo -e "  ${YELLOW}Note:${NC} no VAST_TCP_PORT_${proxy_port} env var was found."
+      echo -e "  If the URL above is unreachable, the Vast.ai instance didn't expose"
+      echo -e "  port ${proxy_port}. Either:"
+      echo -e "    • Edit the instance → add ${proxy_port} to 'On-start script' port list, or"
+      echo -e "    • Use the SSH tunnel below (always works)."
+    fi
+
+    if [[ "$proxy_ready" != "true" ]]; then
+      echo ""
+      echo -e "  ${YELLOW}Warning:${NC} proxy is listening, but dashboard/open-webui backends are not healthy yet."
+      echo -e "  Run: ${BOLD}bash ${SCRIPT_NAME} --fix${NC}"
+    fi
+
+    echo ""
+  fi
+}
+
+_print_ssh_section() {
+  local ds_dir="$1" env_file="$2" host_ip="$3" ssh_port="$4"
+  echo -e "${BOLD}━━━ SSH Tunnel (recommended for Vast.ai) ━━━${NC}"
+  echo ""
+
+  local tunnel_flags=""
+  local remapped_notes=""
+  local entry_port windows_local_proxy_port
+  entry_port="$(env_get "$env_file" "DASHBOARD_PORT")"
+  entry_port="${entry_port:-3001}"
+  windows_local_proxy_port="58080"
+
+  while IFS='|' read -r key port _label; do
+    [[ "$key" == "REVERSE_PROXY_PORT" ]] && continue
+    local local_port="$port"
+    if [[ "$port" -lt 1024 ]]; then
+      local_port=$((10000 + port))
+      remapped_notes="${remapped_notes}\n  ${DIM}  Port ${port} remapped to local ${local_port} (ports <1024 need admin)${NC}"
+    fi
+    tunnel_flags="${tunnel_flags} -L ${local_port}:127.0.0.1:${port}"
+  done < <(discover_service_ports "$ds_dir")
+
+  echo -e "  ${BOLD}Windows PowerShell (all ports, recommended):${NC}"
+  echo -e "${DIM}ssh -N -o ExitOnForwardFailure=yes -p ${ssh_port} -i \$env:USERPROFILE\\.ssh\\id_ed25519 -L ${windows_local_proxy_port}:127.0.0.1:${entry_port}${tunnel_flags} root@${host_ip}${NC}"
+  echo -e "${DIM}Open dashboard: http://127.0.0.1:3001/${NC}"
+  echo -e "${DIM}Easy alias:    http://127.0.0.1:${windows_local_proxy_port}/${NC}"
+  echo ""
+
+  echo -e "  ${BOLD}Linux / macOS (all ports):${NC}"
+  echo -e "${DIM}ssh -N -p ${ssh_port} -i ~/.ssh/id_ed25519 -L ${windows_local_proxy_port}:127.0.0.1:${entry_port}${tunnel_flags} root@${host_ip}${NC}"
+  echo -e "${DIM}Open dashboard: http://127.0.0.1:3001/${NC}"
+  echo ""
+
+  echo -e "  ${BOLD}Auto-reconnect scripts:${NC}"
+  echo -e "  ${DIM}Windows: scp -P ${ssh_port} root@${host_ip}:${ds_dir}/connect-tunnel.ps1 .${NC}"
+  echo -e "  ${DIM}         powershell -ExecutionPolicy Bypass -File .\\connect-tunnel.ps1${NC}"
+  echo -e "  ${DIM}Linux/macOS/WSL: scp -P ${ssh_port} root@${host_ip}:${ds_dir}/connect-tunnel.sh .${NC}"
+  echo -e "  ${DIM}                 FULL_TUNNEL=1 bash connect-tunnel.sh${NC}"
+  echo ""
+  echo -e "  ${DIM}If Windows reports \"bind [127.0.0.1]:PORT: Permission denied\",${NC}"
+  echo -e "  ${DIM}that local port is reserved by Hyper-V/WinNAT. Use a different local port:${NC}"
+  echo -e "  ${DIM}  -L 58080:127.0.0.1:${entry_port}   (or any free high port)${NC}"
+  echo -e "  ${DIM}Optional admin fix:${NC}"
+  echo -e "  ${DIM}  net stop winnat; net start winnat   (run PowerShell as admin)${NC}"
+  echo -e "  ${DIM}Check excluded ranges: netsh int ipv4 show excludedportrange protocol=tcp${NC}"
+  echo -e "  ${DIM}If you see \"channel N: open failed: connect failed: Connection refused\",${NC}"
+  echo -e "  ${DIM}the SSH tunnel is up, but that specific remote service is not listening yet.${NC}"
+  echo ""
+
+  if [[ -n "$remapped_notes" ]]; then
+    echo -e "${BOLD}  Remapped privileged ports:${NC}"
+    echo -e "$remapped_notes"
+    echo ""
+  fi
+
+}
+
+_print_service_list() {
+  local ds_dir="$1"
+  echo -e "${BOLD}Services:${NC}"
+  discover_service_ports "$ds_dir" | while IFS='|' read -r key port label; do
+    [[ "$key" == "REVERSE_PROXY_PORT" ]] && continue
+    printf "  %-22s http://localhost:%s\n" "${label}:" "${port}"
+  done
+  echo ""
+}
+
+_print_model_upload_help() {
+  local ds_dir="$1" host_ip="$2" ssh_port="$3"
+  echo -e "${BOLD}Upload Custom Models:${NC}"
+  echo "  scp -P ${ssh_port} my-model.gguf root@${host_ip}:${ds_dir}/data/models/"
+  echo "  # Then: edit .env GGUF_FILE=my-model.gguf && docker restart dream-llama-server"
+  echo ""
+}
+
+_print_commands_help() {
+  local ds_dir="$1"
+  local script_name="${SCRIPT_NAME:-setup.sh}"
+  echo -e "${BOLD}Commands:${NC}"
+  echo "  bash ${script_name} --status     # Check health"
+  echo "  bash ${script_name} --info       # Connection details"
+  echo "  bash ${script_name} --fix        # Apply fixes + restart"
+  echo "  bash ${script_name} --resume     # Quick restart after SSH drop"
+  echo "  bash ${script_name} --teardown   # Stop all services"
+  echo ""
+}
diff --git a/dream-server/installers/p2p-gpu/lib/permissions.sh b/dream-server/installers/p2p-gpu/lib/permissions.sh
new file mode 100644
index 000000000..b665104f3
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/permissions.sh
@@ -0,0 +1,431 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Permission System
+# ============================================================================
+# Part of: dream-server/installers/p2p-gpu/lib/
+# Purpose: POSIX ACLs, setgid, UID-specific ownership, data dir scaffolding
+#
+# Expects: DREAM_USER, DREAM_HOME, LOGFILE, log(), warn(), err()
+# Provides: ensure_acl_tools(), apply_data_acl(), apply_multi_uid_perms(),
+#           fix_known_uid_requirements(), precreate_extension_data_dirs(),
+#           configure_dream_umask(), create_permission_fix_script()
+#
+# Modder notes:
+#   Three-layer permission system:
+#     1. POSIX ACLs with default entries on data/
+#     2. Setgid bit (2775) on directories
+#     3. Known UID overrides for services that check ownership at startup
+#
+#   [FIX: shared-acl] Permission strategy:
+#     - Primary: setgid (2775) + POSIX ACLs → group-based access
+#     - Shared dirs get explicit per-UID ACLs for the writers we know about
+#     - setfacl is required; fail fast when unavailable
+#
+#   Error handling — two tiers:
+#     HARD-FAIL (exit 1): setfacl application, acl package install,
+#       primary chown/chmod on data dirs — if these fail the stack
+#       cannot start safely.
+#     WARN-AND-CONTINUE (|| warn): service-specific chown for individual
+#       extensions (qdrant, whisper, dashboard-api) — one service failing
+#       ownership should not prevent the other 16 from starting. Also used
+#       for UID extraction (parse helper) and generated repair scripts
+#       (which should fix as much as possible per run).
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+# Install ACL tools if missing
+ensure_acl_tools() {
+  if ! command -v setfacl &>/dev/null; then
+    if ! apt-get install -y -qq acl 2>>"$LOGFILE"; then
+      err "Failed to install acl package — setfacl is required. Install manually: apt-get install acl"
+      exit 1
+    fi
+  fi
+}
+
+# Apply POSIX ACLs + setgid so every container UID can coexist.
+# This is the PRIMARY permission mechanism — covers most services.
+apply_data_acl() {
+  local dir="$1"
+  [[ ! -d "$dir" ]] && return 0
+
+  if ! chown -R "${DREAM_USER}:${DREAM_USER}" "$dir"; then
+    err "chown failed on ${dir} — cannot set base ownership for data directory"
+    exit 1
+  fi
+  if ! find "$dir" -type d -exec chmod 2775 {} +; then
+    err "chmod dirs failed on ${dir} — cannot set setgid on data directories"
+    exit 1
+  fi
+  if ! find "$dir" -type f -exec chmod 0664 {} +; then
+    err "chmod files failed on ${dir} — cannot set group-writable on data files"
+    exit 1
+  fi
+
+  if ! command -v setfacl &>/dev/null; then
+    err "setfacl unavailable — install with: apt-get install acl"
+    exit 1
+  fi
+
+  # dashboard-api runs as uid 1000 (dreamer) and needs write access to /data
+  # for .extensions-lock and token_counter.json.
+  if ! setfacl -R -d -m "u::rwx,u:1000:rwx,g::rwx,o::rx" "$dir"; then
+    err "Failed to apply default ACLs on ${dir} — mount may be ACL-incompatible"
+    exit 1
+  fi
+  if ! setfacl -R -m "u:1000:rwx,g::rwx" "$dir"; then
+    err "Failed to apply current ACLs on ${dir} — mount may be ACL-incompatible"
+    exit 1
+  fi
+  log "Applied POSIX ACLs on ${dir}"
+}
+
+# [FIX: shared-acl] Apply explicit ACLs to directories with multiple writers.
+# The caller must name the additional UIDs that need write access.
+apply_multi_uid_perms() {
+  local dir="$1" reason="$2"
+  shift 2
+  [[ ! -d "$dir" ]] && return 0
+
+  if ! chown -R "${DREAM_USER}:${DREAM_USER}" "$dir"; then
+    err "chown failed on ${dir} — cannot set base ownership for shared directory"
+    exit 1
+  fi
+  if ! find "$dir" -type d -exec chmod 2775 {} +; then
+    err "chmod dirs failed on ${dir} — cannot set setgid on shared directories"
+    exit 1
+  fi
+  if ! find "$dir" -type f -exec chmod 0664 {} +; then
+    err "chmod files failed on ${dir} — cannot set group-writable on shared files"
+    exit 1
+  fi
+
+  if ! command -v setfacl &>/dev/null; then
+    err "setfacl unavailable — install with: apt-get install acl"
+    exit 1
+  fi
+
+  local acl_suffix=""
+  if [[ $# -gt 0 ]]; then
+    acl_suffix=",$*"
+  fi
+
+  if ! setfacl -R -d -m "u::rwx,g::rwx,o::rx${acl_suffix}" "$dir"; then
+    err "Failed to apply shared default ACLs on ${dir} — mount may be ACL-incompatible"
+    exit 1
+  fi
+  if ! setfacl -R -m "u::rwx,g::rwx${acl_suffix}" "$dir"; then
+    err "Failed to apply shared current ACLs on ${dir} — mount may be ACL-incompatible"
+    exit 1
+  fi
+  log "Applied shared ACLs on ${dir} (reason: ${reason})"
+}
+
+# Extract numeric UID from a compose.yaml user: directive
+_extract_compose_uid() {
+  local compose_file="$1"
+  [[ ! -f "$compose_file" ]] && return 0
+  # [NON-FATAL: discovery] One bad compose file should not block others.
+  python3 -c "
+import yaml, re, sys
+try:
+    data = yaml.safe_load(open(sys.argv[1]))
+    services = data.get('services') or {}
+    for sdef in services.values():
+        user = str(sdef.get('user', ''))
+        if not user: continue
+        resolved = re.sub(r'\\\$\{[A-Za-z_]+:-(\d+)\}', r'\1', user)
+        uid = resolved.split(':')[0].strip()
+        if uid.isdigit():
+            print(uid)
+            break
+except yaml.YAMLError as e:
+    print(f'YAML parse error in {sys.argv[1]}: {e}', file=sys.stderr)
+except OSError as e:
+    print(f'File read error {sys.argv[1]}: {e}', file=sys.stderr)
+" "$compose_file" || warn "UID extraction failed for ${compose_file} (non-fatal)"
+}
+
+# Fix UID-specific ownership that ACLs alone don't solve
+fix_known_uid_requirements() {
+  local data_dir="$1"
+  local gpu_backend="${2:-nvidia}"
+  local ds_dir
+  ds_dir=$(dirname "$data_dir")
+
+  _fix_dynamic_uids "$ds_dir" "$data_dir"
+  _fix_uid_exceptions "$data_dir" "$gpu_backend"
+
+  log "Fixed UID-specific ownership for services (dynamic + exceptions)"
+}
+
+_fix_dynamic_uids() {
+  local ds_dir="$1" data_dir="$2"
+  local ext_dirs=("${ds_dir}/extensions/services" "${ds_dir}/user-extensions")
+  local dream_uid
+  dream_uid=$(id -u "$DREAM_USER" 2>>"$LOGFILE" || echo "")
+  for ext_root in "${ext_dirs[@]}"; do
+    [[ ! -d "$ext_root" ]] && continue
+    for ext_path in "${ext_root}"/*/; do
+      [[ ! -d "$ext_path" ]] && continue
+      local ext_name
+      ext_name=$(basename "$ext_path")
+      local ext_data="${data_dir}/${ext_name}"
+      local compose_file=""
+      for candidate in "${ext_path}compose.yaml" "${ext_path}compose.yml"; do
+        [[ -f "$candidate" ]] && compose_file="$candidate" && break
+      done
+      [[ -z "$compose_file" ]] && continue
+      local uid
+      uid=$(_extract_compose_uid "$compose_file")
+      if [[ -n "$uid" && "$uid" != "0" ]]; then
+        mkdir -p "$ext_data"
+        # best-effort: one extension failing ownership should not block others
+        if [[ -n "$dream_uid" && "$uid" == "$dream_uid" ]]; then
+          continue
+        fi
+        if ! chown -R "${uid}:${uid}" "$ext_data" 2>>"$LOGFILE"; then
+          warn "chown ${ext_name} to uid ${uid} failed (non-fatal) — attempting ACL fallback"
+          if command -v setfacl &>/dev/null; then
+            # [NON-FATAL: ${ext_name}] Individual service failure does not block others.
+            setfacl -R -m "u:${uid}:rwx" "$ext_data" 2>>"$LOGFILE" \
+              || warn "setfacl ${ext_name} uid ${uid} failed (non-fatal)"
+            # [NON-FATAL: ${ext_name}] Individual service failure does not block others.
+            setfacl -R -d -m "u:${uid}:rwx" "$ext_data" 2>>"$LOGFILE" \
+              || warn "setfacl default ${ext_name} uid ${uid} failed (non-fatal)"
+          fi
+        fi
+      fi
+    done
+  done
+}
+
+_fix_uid_exceptions() {
+  local data_dir="$1" gpu_backend="$2"
+
+  # qdrant: uid 1000, no user: in compose.yaml — explicit chown required
+  if [[ -d "${data_dir}/qdrant" ]]; then
+    # best-effort: qdrant-specific ownership — does not block other services
+    # [NON-FATAL: qdrant] Individual service failure does not block others.
+    chown -R 1000:1000 "${data_dir}/qdrant" || warn "qdrant ownership fix failed (non-fatal)"
+  fi
+
+  # searxng: uid varies by image version (977 or 1000) — grant both known UIDs
+  if [[ -d "${data_dir}/searxng" ]]; then
+    apply_multi_uid_perms "${data_dir}/searxng" "uid varies by image version (977/1000)" "u:977:rwx,u:1000:rwx"
+  fi
+
+  # comfyui: AMD vs NVIDIA layout
+  fix_comfyui_permissions "$data_dir" "$gpu_backend"
+
+  # open-webui: grant both root (container) and uid 1000 (dream/dashboard-api)
+  if [[ -d "${data_dir}/open-webui" ]]; then
+    if ! setfacl -R -d -m "u::rwx,u:0:rwx,u:1000:rwx,g::rwx,o::rx" "${data_dir}/open-webui"; then
+      err "Failed to apply default ACLs on ${data_dir}/open-webui — mount may be ACL-incompatible"
+      exit 1
+    fi
+    if ! setfacl -R -m "u:0:rwx,u:1000:rwx,g::rwx" "${data_dir}/open-webui"; then
+      err "Failed to apply ACLs on ${data_dir}/open-webui — mount may be ACL-incompatible"
+      exit 1
+    fi
+  fi
+
+  # whisper: grant known writers uid 1000 + root for cache/bootstrap flows
+  if [[ -d "${data_dir}/whisper" ]]; then
+    # best-effort: whisper ownership — ACLs above enforce access regardless
+    # [NON-FATAL: whisper] Individual service failure does not block others.
+    chown -R 1000:1000 "${data_dir}/whisper" || warn "whisper chown failed (non-fatal)"
+    if ! setfacl -R -d -m "u::rwx,u:0:rwx,u:1000:rwx,g::rwx,o::rx" "${data_dir}/whisper"; then
+      err "Failed to apply default ACLs on ${data_dir}/whisper — mount may be ACL-incompatible"
+      exit 1
+    fi
+    if ! setfacl -R -m "u:0:rwx,u:1000:rwx,g::rwx" "${data_dir}/whisper"; then
+      err "Failed to apply ACLs on ${data_dir}/whisper — mount may be ACL-incompatible"
+      exit 1
+    fi
+  fi
+
+  # dashboard-api: uid 1000 (dreamer) — needs rw on data/ and .env
+  local ds_dir
+  ds_dir=$(dirname "$data_dir")
+  if [[ -d "${data_dir}/dashboard-api" ]]; then
+    # best-effort: dashboard-api ownership — service starts as uid 1000 regardless
+    # [NON-FATAL: dashboard-api] Individual service failure does not block others.
+    chown -R 1000:1000 "${data_dir}/dashboard-api" || warn "dashboard-api chown failed (non-fatal)"
+  fi
+  if command -v setfacl &>/dev/null && [[ -f "${ds_dir}/.env" ]]; then
+    if ! setfacl -m u:1000:rw "${ds_dir}/.env"; then
+      err "Failed to apply ACL on ${ds_dir}/.env for dashboard-api"
+      exit 1
+    fi
+  fi
+
+  # models (shared): grant the non-root writer used by the p2p-gpu toolkit
+  if [[ -d "${data_dir}/models" ]]; then
+    apply_multi_uid_perms "${data_dir}/models" "multi-service write: llama-server, comfyui, aria2c" "u:1000:rwx"
+  fi
+}
+
+# Pre-create data directories for all known extensions
+precreate_extension_data_dirs() {
+  local ds_dir="$1"
+  local data_dir="${ds_dir}/data"
+  local ext_dirs=("${ds_dir}/extensions/services" "${ds_dir}/user-extensions")
+
+  for ext_root in "${ext_dirs[@]}"; do
+    [[ ! -d "$ext_root" ]] && continue
+    for manifest in "${ext_root}"/*/manifest.yaml; do
+      [[ ! -f "$manifest" ]] && continue
+      local ext_name
+      ext_name=$(basename "$(dirname "$manifest")")
+      mkdir -p "${data_dir}/${ext_name}"
+    done
+  done
+
+  # Pre-create ComfyUI bind-mount paths so Docker doesn't auto-create root-owned
+  # 0755 directories that are unwritable for the non-root comfyui user.
+  mkdir -p "${data_dir}/comfyui/models" \
+    "${data_dir}/comfyui/models/checkpoints" \
+    "${data_dir}/comfyui/output" \
+    "${data_dir}/comfyui/input" \
+    "${data_dir}/comfyui/workflows" \
+    "${data_dir}/comfyui/ComfyUI/models" \
+    "${data_dir}/comfyui/ComfyUI/output" \
+    "${data_dir}/comfyui/ComfyUI/input" \
+    "${data_dir}/comfyui/ComfyUI/custom_nodes"
+
+  # [NON-FATAL: extensions] Optional user-extensions directory.
+  mkdir -p "${ds_dir}/user-extensions" || warn "could not create user-extensions (non-fatal)"
+  log "Pre-created data directories for all known extensions"
+}
+
+# Set dream user's umask for group-writable files
+configure_dream_umask() {
+  for f in "${DREAM_HOME}/.bashrc" "${DREAM_HOME}/.profile"; do
+    if [[ -f "$f" ]] && ! grep -q 'umask 0002' "$f"; then
+      printf '\n# DreamServer: group-writable files by default\numask 0002\n' >> "$f"
+    fi
+  done
+}
+
+# Generate standalone permission-fix script
+create_permission_fix_script() {
+  local ds_dir="$1"
+  local uid_fix_lines=""
+
+  local ext_dirs=("${ds_dir}/extensions/services" "${ds_dir}/user-extensions")
+  for ext_root in "${ext_dirs[@]}"; do
+    [[ ! -d "$ext_root" ]] && continue
+    for ext_path in "${ext_root}"/*/; do
+      [[ ! -d "$ext_path" ]] && continue
+      local ext_name
+      ext_name=$(basename "$ext_path")
+      for candidate in "${ext_path}compose.yaml" "${ext_path}compose.yml"; do
+        [[ ! -f "$candidate" ]] && continue
+        local uid
+        uid=$(_extract_compose_uid "$candidate")
+        if [[ -n "$uid" && "$uid" != "0" ]]; then
+          # [NON-FATAL: fix-script] Generated fixer is best-effort by design.
+          uid_fix_lines+="[[ -d \"\${DATA_DIR}/${ext_name}\" ]] && chown -R ${uid}:${uid} \"\${DATA_DIR}/${ext_name}\" || warn \"${ext_name} chown failed (non-fatal)\""$'\n'
+        fi
+        break
+      done
+    done
+  done
+
+  mkdir -p "${ds_dir}/scripts"
+  cat > "${ds_dir}/scripts/fix-permissions.sh" << PERMFIX_EOF
+#!/usr/bin/env bash
+set -euo pipefail
+# DreamServer permission fixer — auto-generated, safe to run anytime.
+SCRIPT_DIR="\$(cd "\$(dirname "\$0")/.." && pwd)"
+DATA_DIR="\${SCRIPT_DIR}/data"
+warn() { echo -e "\033[1;33m[!]\033[0m \$*" >&2; }
+
+echo "[*] Fixing permissions on \${DATA_DIR}..."
+
+if ! command -v setfacl &>/dev/null; then
+  echo "[x] setfacl unavailable — install with: apt-get install acl" >&2
+  exit 1
+fi
+
+find "\$DATA_DIR" -type d -exec chmod 2775 {} + || warn "chmod dirs failed (non-fatal)"
+find "\$DATA_DIR" -type f -exec chmod 0664 {} + || warn "chmod files failed (non-fatal)"
+if ! setfacl -R -d -m "u::rwx,u:1000:rwx,g::rwx,o::rx" "\$DATA_DIR"; then
+  echo "[x] Failed to apply default ACLs on \$DATA_DIR — mount may be ACL-incompatible" >&2
+  exit 1
+fi
+if ! setfacl -R -m "u:1000:rwx,g::rwx" "\$DATA_DIR"; then
+  echo "[x] Failed to apply current ACLs on \$DATA_DIR — mount may be ACL-incompatible" >&2
+  exit 1
+fi
+
+${uid_fix_lines}
+[[ -d "\${DATA_DIR}/qdrant" ]] && chown -R 1000:1000 "\${DATA_DIR}/qdrant" || warn "qdrant fix failed (non-fatal)"
+if [[ -d "\${DATA_DIR}/open-webui" ]]; then
+  if ! setfacl -R -d -m "u::rwx,u:0:rwx,u:1000:rwx,g::rwx,o::rx" "\${DATA_DIR}/open-webui"; then
+    echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
+    exit 1
+  fi
+  if ! setfacl -R -m "u:0:rwx,u:1000:rwx,g::rwx" "\${DATA_DIR}/open-webui"; then
+    echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
+    exit 1
+  fi
+fi
+[[ -d "\${DATA_DIR}/whisper" ]] && chown -R 1000:1000 "\${DATA_DIR}/whisper" || warn "whisper chown failed (non-fatal)"
+if [[ -d "\${DATA_DIR}/whisper" ]]; then
+  if ! setfacl -R -d -m "u::rwx,u:0:rwx,u:1000:rwx,g::rwx,o::rx" "\${DATA_DIR}/whisper"; then
+    echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
+    exit 1
+  fi
+  if ! setfacl -R -m "u:0:rwx,u:1000:rwx,g::rwx" "\${DATA_DIR}/whisper"; then
+    echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
+    exit 1
+  fi
+fi
+# Multi-UID directories: searxng (uid varies), models (non-root writer)
+if [[ -d "\${DATA_DIR}/searxng" ]]; then
+  if ! setfacl -R -d -m "u::rwx,u:977:rwx,u:1000:rwx,g::rwx,o::rx" "\${DATA_DIR}/searxng"; then
+    echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
+    exit 1
+  fi
+  if ! setfacl -R -m "u:977:rwx,u:1000:rwx,g::rwx" "\${DATA_DIR}/searxng"; then
+    echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
+    exit 1
+  fi
+fi
+if [[ -d "\${DATA_DIR}/models" ]]; then
+  if ! setfacl -R -d -m "u::rwx,u:1000:rwx,g::rwx,o::rx" "\${DATA_DIR}/models"; then
+    echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
+    exit 1
+  fi
+  if ! setfacl -R -m "u:1000:rwx,g::rwx" "\${DATA_DIR}/models"; then
+    echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
+    exit 1
+  fi
+fi
+
+for d in \
+  "\${DATA_DIR}/comfyui/models" \
+  "\${DATA_DIR}/comfyui/models/checkpoints" \
+  "\${DATA_DIR}/comfyui/output" \
+  "\${DATA_DIR}/comfyui/input" \
+  "\${DATA_DIR}/comfyui/workflows" \
+  "\${DATA_DIR}/comfyui/ComfyUI/models" \
+  "\${DATA_DIR}/comfyui/ComfyUI/output" \
+  "\${DATA_DIR}/comfyui/ComfyUI/input" \
+  "\${DATA_DIR}/comfyui/ComfyUI/custom_nodes"; do
+  mkdir -p "\$d" || warn "comfyui mkdir failed on \$d (non-fatal)"
+  [[ -d "\$d" ]] && chmod 2775 "\$d" || warn "comfyui dir mode fix failed on \$d (non-fatal)"
+done
+
+find "\${SCRIPT_DIR}/scripts" -name "*.sh" -exec chmod +x {} + || warn "scripts chmod failed (non-fatal)"
+echo "[✓] Permissions fixed"
+PERMFIX_EOF
+
+  chmod +x "${ds_dir}/scripts/fix-permissions.sh"
+  log "Created reusable permission fixer: ${ds_dir}/scripts/fix-permissions.sh"
+}
diff --git a/dream-server/installers/p2p-gpu/lib/services.sh b/dream-server/installers/p2p-gpu/lib/services.sh
new file mode 100644
index 000000000..ced8fff77
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/services.sh
@@ -0,0 +1,797 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Service Discovery & Management
+# ============================================================================
+# Part of: dream-server/installers/p2p-gpu/lib/
+# Purpose: Manifest-driven service discovery, port enumeration, compose
+#          command detection, Docker image pre-pull, service startup
+#
+# Expects: DREAM_USER, LOGFILE, log(), warn(), err(), env_get(), env_set(),
+#          expose_ports_for_vastai()
+# Provides: read_manifest_field(), discover_all_services(),
+#           discover_service_ports(), extract_compose_uid(),
+#           get_compose_cmd(), start_services(), prepull_docker_images()
+#
+# Modder notes:
+#   Requires python3 + PyYAML (installed in Phase 1). Functions gracefully
+#   return empty when python3/PyYAML is unavailable.
+#   [FIX: python-except] Python catches only specific exceptions with logging.
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+# Resolve dream-network gateway for host-agent binding.
+_resolve_dream_network_gateway() {
+  local gateway
+  gateway=$(docker network inspect dream-network \
+    --format '{{(index .IPAM.Config 0).Gateway}}' 2>>"$LOGFILE" | head -1 || echo "")
+  gateway=$(echo "$gateway" | xargs)
+  if [[ "$gateway" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+    echo "$gateway"
+    return 0
+  fi
+  return 1
+}
+
+_is_loopback_addr() {
+  case "$1" in
+    ""|"127.0.0.1"|"localhost"|"::1") return 0 ;;
+    *) return 1 ;;
+  esac
+}
+
+_restart_host_agent() {
+  local ds_dir="$1"
+  local dream_cli="${ds_dir}/dream-cli"
+
+  if [[ ! -x "$dream_cli" ]]; then
+    warn "dream-cli not found at ${dream_cli} — skipping host agent restart"
+    return 1
+  fi
+
+  # [NON-FATAL: host-agent] Restart can be retried manually if it fails.
+  su - "$DREAM_USER" -c "cd ${ds_dir} && DREAM_HOME=${ds_dir} ./dream-cli agent restart" \
+    >> "$LOGFILE" 2>&1 || { warn "Host agent restart failed (non-fatal)"; return 1; }
+  return 0
+}
+
+# Ensure host agent binds to the Dream network gateway so containers can reach it.
+_ensure_host_agent_network_binding() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+  [[ ! -f "$env_file" ]] && return 0
+
+  local gateway
+  gateway=$(_resolve_dream_network_gateway) || return 0
+
+  local bind host updated=false
+  bind="$(env_get "$env_file" "DREAM_AGENT_BIND")"
+  host="$(env_get "$env_file" "DREAM_AGENT_HOST")"
+
+  if _is_loopback_addr "$bind"; then
+    env_set "$env_file" "DREAM_AGENT_BIND" "$gateway"
+    updated=true
+  fi
+  if _is_loopback_addr "$host"; then
+    env_set "$env_file" "DREAM_AGENT_HOST" "$gateway"
+    updated=true
+  fi
+
+  if [[ "$updated" == "true" ]]; then
+    log "Pinned host agent binding to dream-network gateway ${gateway}"
+    _restart_host_agent "$ds_dir" || warn "Host agent restart after bind update failed (non-fatal)"
+  fi
+}
+
+# Generate a p2p-gpu compose overlay that fixes multi-GPU device reservation
+# and passes GPU assignment env vars that the upstream multigpu overlay omits.
+# Only generated when GPU_COUNT >= 2 and GPU_BACKEND is nvidia.
+# The overlay is appended to compose_flags so it merges on top of whatever
+# the upstream resolver selected.
+_generate_p2p_gpu_overlay() {
+  local ds_dir="$1"
+  local overlay="${ds_dir}/docker-compose.p2p-gpu.yml"
+  local backend="${GPU_BACKEND:-}"
+
+  if [[ "${GPU_COUNT:-0}" -lt 2 ]]; then
+    if [[ -f "$overlay" ]]; then
+      # [NON-FATAL: overlay] Cleanup failure should not block install flow.
+      rm -f "$overlay" || warn "Failed to remove p2p-gpu overlay (non-fatal)"
+    fi
+    return 0
+  fi
+
+  if [[ -z "$backend" ]]; then
+    backend=$(detect_gpu_backend)
+  fi
+
+  if [[ "$backend" == "nvidia" ]]; then
+    cat > "$overlay" << 'P2P_OVERLAY_EOF'
+# Auto-generated by p2p-gpu toolkit - do not edit manually.
+# Ensures multi-GPU device reservation and GPU assignment env vars.
+services:
+  llama-server:
+    environment:
+      LLAMA_ARG_MAIN_GPU: "${LLAMA_ARG_MAIN_GPU:-}"
+      GGML_CUDA_P2P: "${GGML_CUDA_P2P:-}"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+P2P_OVERLAY_EOF
+  elif [[ "$backend" == "amd" ]]; then
+    cat > "$overlay" << 'P2P_OVERLAY_EOF'
+# Auto-generated by p2p-gpu toolkit - do not edit manually.
+services:
+  llama-server:
+    environment:
+      LLAMA_ARG_TENSOR_SPLIT: "${LLAMA_ARG_TENSOR_SPLIT:-}"
+P2P_OVERLAY_EOF
+  else
+    if [[ -f "$overlay" ]]; then
+      # [NON-FATAL: overlay] Cleanup failure should not block install flow.
+      rm -f "$overlay" || warn "Failed to remove p2p-gpu overlay (non-fatal)"
+    fi
+    return 0
+  fi
+
+  if [[ -f "$overlay" ]]; then
+    # [NON-FATAL: overlay] Ownership fix failure should not block compose usage.
+    chown "${DREAM_USER}:${DREAM_USER}" "$overlay" || warn "Overlay ownership fix failed (non-fatal)"
+    # [NON-FATAL: overlay] Mode fix failure should not block compose usage.
+    chmod 0644 "$overlay" || warn "Overlay chmod failed (non-fatal)"
+    log "Generated p2p-gpu compose overlay: ${overlay}"
+  fi
+}
+
+# Ensure Dream host agent is running so Dashboard model downloads can start.
+_ensure_host_agent_running() {
+  local ds_dir="$1"
+  local dream_cli="${ds_dir}/dream-cli"
+  local agent_port agent_bind
+  local agent_probe
+
+  if [[ ! -x "$dream_cli" ]]; then
+    warn "dream-cli not found at ${dream_cli} — skipping host agent auto-start"
+    return 0
+  fi
+
+  agent_port="$(grep '^DREAM_AGENT_PORT=' "${ds_dir}/.env" 2>/dev/null | cut -d= -f2 | tr -d '[:space:]' || echo "7710")"  # stderr expected: .env may not exist
+  agent_port="${agent_port:-7710}"
+  agent_bind="$(grep '^DREAM_AGENT_BIND=' "${ds_dir}/.env" 2>/dev/null | cut -d= -f2 | tr -d '[:space:]' || echo "127.0.0.1")"  # stderr expected: .env may not exist
+  agent_bind="${agent_bind:-127.0.0.1}"
+  agent_probe="$agent_bind"
+  if [[ "$agent_probe" == "0.0.0.0" || "$agent_probe" == "::" ]]; then
+    agent_probe="127.0.0.1"
+  fi
+
+  if curl -sf --max-time 2 "http://${agent_probe}:${agent_port}/health" >/dev/null 2>&1; then
+    log "Host agent already running on port ${agent_port}"
+    return 0
+  fi
+
+  if ! command -v python3 &>/dev/null; then
+    warn "python3 not found — host agent cannot start"
+    warn "Install: apt-get install -y python3"
+    return 1
+  fi
+
+  local agent_script="${ds_dir}/bin/dream-host-agent.py"
+  if [[ ! -f "$agent_script" ]]; then
+    warn "Agent script not found at ${agent_script} — skipping"
+    return 1
+  fi
+
+  local attempt pid_file="${ds_dir}/data/dream-host-agent.pid" wait_elapsed
+  for attempt in 1 2; do
+    log "Starting host agent (attempt ${attempt}/2)..."
+    # [NON-FATAL: host-agent] Start failure can be retried or handled manually.
+    su - "$DREAM_USER" -c "cd ${ds_dir} && DREAM_HOME=${ds_dir} ./dream-cli agent start" \
+      >> "$LOGFILE" 2>&1 || warn "dream-cli agent start returned non-zero (attempt ${attempt})"
+
+    wait_elapsed=0
+    while [[ $wait_elapsed -lt 20 ]]; do
+      sleep 3
+      wait_elapsed=$((wait_elapsed + 3))
+      if curl -sf --max-time 2 "http://${agent_probe}:${agent_port}/health" >/dev/null 2>&1; then
+        log "Host agent verified running on port ${agent_port} (attempt ${attempt})"
+        return 0
+      fi
+    done
+
+    if [[ $attempt -eq 1 ]]; then
+      warn "Host agent not responding after start — retrying..."
+      if [[ -f "$pid_file" ]]; then
+        # [NON-FATAL: cleanup] Stale pid cleanup should not block host agent retry.
+        kill "$(cat "$pid_file")" 2>>"$LOGFILE" || warn "stale host agent pid in ${pid_file} could not be killed"
+        rm -f "$pid_file"
+      fi
+    fi
+  done
+
+  warn "Host agent failed to start after 2 attempts"
+  warn "Manual start: su - ${DREAM_USER} -c 'cd ${ds_dir} && DREAM_HOME=${ds_dir} ./dream-cli agent start'"
+  warn "Check logs: cat ${ds_dir}/data/dream-host-agent.log"
+  return 1
+}
+
+# Ensure OpenCode web is reachable on no-systemd hosts (Vast.ai fallback).
+_ensure_opencode_web_running() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+  local opencode_bin="/home/${DREAM_USER}/.opencode/bin/opencode"
+  local opencode_port opencode_password escaped_password launch_dir escaped_launch_dir
+
+  opencode_port=$(env_get "$env_file" "OPENCODE_PORT")
+  opencode_port="${opencode_port:-3003}"
+
+  if curl -sf --max-time 3 "http://127.0.0.1:${opencode_port}/" >/dev/null 2>&1; then
+    log "OpenCode web already reachable on port ${opencode_port}"
+    return 0
+  fi
+
+  if [[ ! -x "$opencode_bin" ]]; then
+    warn "OpenCode binary not found at ${opencode_bin} — skipping OpenCode web auto-start"
+    return 0
+  fi
+
+  opencode_password=$(env_get "$env_file" "OPENCODE_SERVER_PASSWORD")
+  if [[ -z "$opencode_password" ]]; then
+    opencode_password=$(openssl rand -base64 16)
+    env_set "$env_file" "OPENCODE_SERVER_PASSWORD" "$opencode_password"
+    log "Generated OPENCODE_SERVER_PASSWORD for secure OpenCode web access"
+  fi
+
+  launch_dir="$ds_dir"
+  if ! su - "$DREAM_USER" -c "test -r $(printf '%q' "$ds_dir") && test -x $(printf '%q' "$ds_dir")"; then
+    launch_dir="$DREAM_HOME"
+    warn "OpenCode launch dir ${ds_dir} is not accessible to ${DREAM_USER}; using ${launch_dir}"
+  fi
+
+  mkdir -p "${ds_dir}/logs"
+  escaped_password=$(printf '%q' "$opencode_password")
+  escaped_launch_dir=$(printf '%q' "$launch_dir")
+  if su - "$DREAM_USER" -c \
+    "cd ${escaped_launch_dir} && OPENCODE_SERVER_PASSWORD=${escaped_password} nohup ${opencode_bin} web --hostname 0.0.0.0 --port ${opencode_port} >> ${ds_dir}/logs/opencode-web.log 2>&1 &" \
+    >> "$LOGFILE" 2>&1; then
+    sleep 2
+    if curl -sf --max-time 4 "http://127.0.0.1:${opencode_port}/" >/dev/null 2>&1; then
+      log "Started OpenCode web fallback on port ${opencode_port}"
+    else
+      warn "OpenCode fallback launch command succeeded but service is not reachable yet"
+    fi
+  else
+    warn "OpenCode fallback launch failed (non-fatal)"
+  fi
+}
+
+_normalize_dashboard_api_port_envs() {
+  local env_file="$1"
+
+  [[ -f "$env_file" ]] || return 0
+
+  python3 - "$env_file" <<'PY'
+from pathlib import Path
+import re
+import sys
+
+path = Path(sys.argv[1])
+text = path.read_text()
+pattern = re.compile(r'^([A-Z0-9_]+_PORT)=(\d+)\s+#.*$')
+changed = []
+lines = []
+
+for line in text.splitlines():
+    match = pattern.match(line)
+    if match:
+      line = f"{match.group(1)}={match.group(2)}"
+      changed.append(match.group(1))
+    lines.append(line)
+
+new_text = "\n".join(lines) + ("\n" if text.endswith("\n") else "")
+if new_text != text:
+    path.write_text(new_text)
+    if changed:
+        print("\n".join(changed))
+PY
+}
+
+# Read a field from a manifest.yaml service: block
+read_manifest_field() {
+  local manifest="$1" field="$2"
+  # [NON-FATAL: discovery] A single bad manifest should not block others.
+  python3 -c "
+import yaml, sys
+try:
+    data = yaml.safe_load(open(sys.argv[1]))
+    svc = data.get('service') or {}
+    val = svc.get(sys.argv[2], '')
+    if isinstance(val, list):
+        print(' '.join(str(v) for v in val))
+    else:
+        print(val)
+except yaml.YAMLError as e:
+    print(f'YAML parse error in {sys.argv[1]}: {e}', file=sys.stderr)
+except OSError as e:
+  print(f'File read error {sys.argv[1]}: {e}', file=sys.stderr)
+" "$manifest" "$field" || warn "manifest field read failed for ${manifest}:${field} (non-fatal)"
+}
+
+# Discover all enabled services from extension manifests.
+# Usage: discover_all_services <ds_dir> [hints_file]
+# Output: ID|PORT_ENV|PORT_DEFAULT|NAME|CATEGORY|PROXY_MODE|STARTUP_BEHAVIOR|CONTAINER_NAME
+discover_all_services() {
+  local ds_dir="$1"
+  local hints_file="${2:-}"
+  if [[ -z "$hints_file" && -n "${SCRIPT_DIR:-}" ]]; then
+    hints_file="${SCRIPT_DIR}/config/service-hints.yaml"
+  fi
+  local ext_dirs=("${ds_dir}/extensions/services" "${ds_dir}/user-extensions")
+
+  for ext_root in "${ext_dirs[@]}"; do
+    [[ ! -d "$ext_root" ]] && continue
+    for manifest in "${ext_root}"/*/manifest.yaml; do
+      [[ ! -f "$manifest" ]] && continue
+          # [NON-FATAL: discovery] A single bad manifest should not block others.
+          python3 -c "import os, yaml, sys; data = yaml.safe_load(open(sys.argv[1])) or {}; svc = data.get('service') or {}; sid = svc.get('id', ''); port_env = svc.get('external_port_env', ''); port_def = svc.get('external_port_default', ''); name = svc.get('name', sid); cat = svc.get('category', 'optional'); hints = {}; hints_path = sys.argv[2] if len(sys.argv) > 2 else ''; hints = ((yaml.safe_load(open(hints_path)) or {}).get(sid, {}) if (hints_path and os.path.exists(hints_path) and sid) else {}); proxy = hints.get('proxy_mode', svc.get('proxy_mode', 'simple')); startup = hints.get('startup_behavior', svc.get('startup_behavior', 'normal')); cname = svc.get('container_name', ''); htimeout = svc.get('health_timeout', 0); startup = 'heavy' if startup == 'normal' and isinstance(htimeout, (int, float)) and htimeout > 20 else startup; print(f'{sid}|{port_env}|{port_def}|{name}|{cat}|{proxy}|{startup}|{cname}') if sid else None" "$manifest" "$hints_file" || warn "service discovery failed for ${manifest} (non-fatal)"
+    done
+  done
+}
+
+# Discover service ports from .env / manifests.
+# Output: SERVICE_KEY|PORT_NUMBER|LABEL
+# Reads explicit _PORT= lines from .env, then fills in manifest defaults
+# for any services whose port_env isn't already set.
+discover_service_ports() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+  local env_example="${ds_dir}/.env.example"
+
+  declare -A PORT_LABELS PORT_DEFAULTS SEEN_KEYS
+  while IFS='|' read -r _id port_env port_def svc_name _rest; do
+    [[ -z "$port_env" ]] && continue
+    PORT_LABELS["$port_env"]="$svc_name"
+    [[ -n "$port_def" ]] && PORT_DEFAULTS["$port_env"]="$port_def"
+  done < <(discover_all_services "$ds_dir")
+
+  local source_file="$env_file"
+  [[ ! -f "$source_file" ]] && source_file="$env_example"
+  [[ ! -f "$source_file" ]] && return 0
+
+  # Emit ports explicitly set in .env
+  awk -F= '/^[A-Z_]+_PORT=/{print}' "$source_file" | while IFS='=' read -r key value; do
+    value=$(echo "$value" | sed 's/[[:space:]]#.*$//' | tr -d '"' | tr -d "'" | xargs)
+    [[ -z "$value" ]] && continue
+    local label="${PORT_LABELS[$key]:-$key}"
+    echo "${key}|${value}|${label}"
+  done
+
+  # Track which keys were already emitted
+  while IFS= read -r key; do
+    SEEN_KEYS["$key"]=1
+  done < <(awk -F= '/^[A-Z_]+_PORT=/{print $1}' "$source_file")
+
+  # Fill in manifest defaults for services not in .env
+  for key in "${!PORT_DEFAULTS[@]}"; do
+    [[ -n "${SEEN_KEYS[$key]:-}" ]] && continue
+    local label="${PORT_LABELS[$key]:-$key}"
+    echo "${key}|${PORT_DEFAULTS[$key]}|${label}"
+  done
+}
+
+# Detect available compose command
+get_compose_cmd() {
+  if docker compose version &>/dev/null; then
+    echo "docker compose"
+  elif command -v docker-compose &>/dev/null; then
+    echo "docker-compose"
+  else
+    err "Neither 'docker compose' nor 'docker-compose' found"
+    exit 1
+  fi
+}
+
+# Pre-pull Docker images in parallel
+prepull_docker_images() {
+  local ds_dir="$1"
+  local max_parallel="${2:-4}"
+
+  local images
+  images=$(grep -rh 'image:' "${ds_dir}"/docker-compose*.yml \
+    "${ds_dir}"/extensions/services/*/compose*.y*ml 2>&1 \
+    | sed -E 's/.*image:\s*//' | tr -d '"' | tr -d "'" \
+    | sort -u | grep -v '^\$' || echo "")
+
+  if [[ -z "$images" ]]; then
+    log "No Docker images found to pre-pull"
+    return 0
+  fi
+
+  local count
+  count=$(echo "$images" | wc -l)
+  log "Pre-pulling ${count} Docker images (${max_parallel} parallel)..."
+
+  # [NON-FATAL: images] Images can be pulled later during compose up.
+  echo "$images" | xargs -P "$max_parallel" -I {} sh -c \
+    'docker pull {} >/dev/null 2>&1 && echo "  pulled: {}" || echo "  skip:   {} (will retry at compose up)"' \
+    || warn "some image pulls failed (non-fatal)"
+
+  log "Docker image pre-pull complete"
+}
+
+# ── Remove stale Docker network ────────────────────────────────────────────
+_cleanup_stale_network() {
+  if ! docker network inspect dream-network >/dev/null 2>&1; then
+    return 0
+  fi
+  local net_label
+  net_label=$(docker network inspect dream-network \
+    --format '{{index .Labels "com.docker.compose.network"}}' 2>&1 || echo "")
+  if [[ -n "$net_label" ]]; then
+    return 0
+  fi
+  log "Removing stale dream-network (missing compose labels)..."
+  for cid in $(docker network inspect dream-network \
+    -f '{{range .Containers}}{{.Name}} {{end}}' 2>&1 || echo ""); do
+    # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
+    docker network disconnect -f dream-network "$cid" || warn "disconnect ${cid} failed (non-fatal)"
+  done
+  # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
+  docker network rm dream-network || warn "network rm failed (non-fatal)"
+}
+
+_set_safe_llama_cpu_caps() {
+  local env_file="$1" max_cpu="$2"
+  [[ ! -f "$env_file" ]] && return 0
+
+  local llama_limit="${max_cpu}.0"
+  local llama_reservation="2.0"
+  if [[ "$max_cpu" -lt 2 ]]; then
+    llama_reservation="1.0"
+  fi
+
+  env_set "$env_file" "LLAMA_CPU_LIMIT" "$llama_limit"
+  env_set "$env_file" "LLAMA_CPU_RESERVATION" "$llama_reservation"
+}
+
+_extract_cpu_ceiling_from_compose_error() {
+  local compose_err="$1"
+  local ceiling=""
+
+  ceiling=$(tr -d '\r' < "$compose_err" | grep -Eo 'range of CPUs is from [0-9.]+ to [0-9.]+' 2>>"$LOGFILE" \
+    | head -1 | awk '{print $NF}' | cut -d'.' -f1 || echo "")
+
+  if [[ -z "$ceiling" ]]; then
+    ceiling=$(tr -d '\r' < "$compose_err" | grep -Eo 'only [0-9]+ CPUs available' 2>>"$LOGFILE" \
+      | head -1 | awk '{print $2}' || echo "")
+  fi
+
+  if [[ "$ceiling" =~ ^[0-9]+$ ]] && [[ "$ceiling" -gt 0 ]]; then
+    echo "$ceiling"
+  fi
+}
+
+_compose_output_has_cpu_error() {
+  local compose_err="$1"
+  tr -d '\r' < "$compose_err" | grep -Eqi "range of CPUs is from|only [0-9]+ CPUs available|invalid.*cpu|NanoCPUs"
+}
+
+_resolve_compose_files_from_flags() {
+  local ds_dir="$1" compose_flags="$2"
+  local prev="" token
+
+  for token in $compose_flags; do
+    if [[ "$prev" == "-f" ]]; then
+      if [[ "$token" == /* ]]; then
+        echo "$token"
+      else
+        echo "${ds_dir}/${token}"
+      fi
+      prev=""
+      continue
+    fi
+    [[ "$token" == "-f" ]] && prev="-f"
+  done
+}
+
+_compose_ansi_flag() {
+  local compose_cmd="$1"
+  case "$compose_cmd" in
+    "docker compose") echo "--ansi never" ;;
+    "docker-compose") echo "--no-ansi" ;;
+    *) echo "" ;;
+  esac
+}
+
+_compose_list_services() {
+  local ds_dir="$1" compose_cmd="$2" compose_flags="$3"
+  local ansi_flag cmd
+  ansi_flag=$(_compose_ansi_flag "$compose_cmd")
+  cmd="${compose_cmd}"
+  [[ -n "$ansi_flag" ]] && cmd="${cmd} ${ansi_flag}"
+  cmd="${cmd} ${compose_flags} config --services"
+
+  su - "$DREAM_USER" -c "cd ${ds_dir} && ${cmd}" 2>>"$LOGFILE"
+}
+
+_extract_missing_image_services() {
+  local compose_err="$1"
+  local matched_lines status=0
+  matched_lines=$(tr -d '\r' < "$compose_err" | grep -Ei 'Error manifest for|pull access denied for') || status=$?
+  # grep exit: 0 = matched, 1 = no match (expected), >1 = real error
+  if (( status > 1 )); then
+    warn "grep failed scanning compose stderr for missing-image errors (status ${status})"
+  fi
+  [[ -z "$matched_lines" ]] && return 0
+
+  local service
+  while IFS= read -r line; do
+    service=""
+    local cleaned="${line//\'/}"
+    cleaned="${cleaned//\"/}"
+    if [[ "$cleaned" =~ ^[[:space:]]*([a-zA-Z0-9._-]+)[[:space:]]+(Error[[:space:]]+manifest[[:space:]]+for|pull[[:space:]]+access[[:space:]]+denied[[:space:]]+for) ]]; then
+      service="${BASH_REMATCH[1]}"
+    elif [[ "$cleaned" =~ [Ss]ervice[[:space:]]*([a-zA-Z0-9._-]+) ]]; then
+      service="${BASH_REMATCH[1]}"
+    elif [[ "$cleaned" =~ ^([a-zA-Z0-9._-]+)[[:space:]]*[\|:] ]]; then
+      service="${BASH_REMATCH[1]}"
+    fi
+    [[ -n "$service" ]] && echo "$service"
+  done <<< "$matched_lines" | sort -u
+}
+
+_compose_up_with_flags() {
+  local ds_dir="$1" compose_cmd="$2" compose_flags="$3" compose_err="$4" up_flags="$5"
+  shift 5
+  local ansi_flag cmd service_args
+  ansi_flag=$(_compose_ansi_flag "$compose_cmd")
+  cmd="${compose_cmd}"
+  [[ -n "$ansi_flag" ]] && cmd="${cmd} ${ansi_flag}"
+  cmd="${cmd} ${compose_flags} up -d"
+  [[ -n "$up_flags" ]] && cmd="${cmd} ${up_flags}"
+  if [[ "$#" -gt 0 ]]; then
+    printf -v service_args ' %q' "$@"
+    cmd="${cmd}${service_args}"
+  fi
+
+  su - "$DREAM_USER" -c "cd ${ds_dir} && ${cmd}" 2>&1 \
+    | tee -a "$LOGFILE" | tee "$compose_err"
+}
+
+_apply_host_cpu_caps() {
+  local ds_dir="$1" env_file="$2" daemon_ceiling="${3:-}" compose_flags="${4:-}"
+  local nproc_count docker_ncpu compose_ceiling max_cpu
+  local -a compose_files=()
+
+  nproc_count=$(nproc 2>>"$LOGFILE" || echo 1)
+  docker_ncpu=$(docker info --format '{{.NCPU}}' 2>>"$LOGFILE" || echo "unknown")
+  compose_ceiling=$(get_compose_cpu_ceiling)
+  max_cpu=$(compute_safe_cpu_cap "$daemon_ceiling")
+
+  cap_cpu_in_yaml "$ds_dir" "$max_cpu"
+  if [[ -n "$compose_flags" ]]; then
+    mapfile -t compose_files < <(_resolve_compose_files_from_flags "$ds_dir" "$compose_flags")
+    if [[ "${#compose_files[@]}" -gt 0 ]]; then
+      cap_cpu_in_files "$max_cpu" "${compose_files[@]}"
+    fi
+  fi
+  _set_safe_llama_cpu_caps "$env_file" "$max_cpu"
+  log "Ensured compose CPU limits <= ${max_cpu} cores (nproc=${nproc_count}, docker=${docker_ncpu}, ceiling=${compose_ceiling}${daemon_ceiling:+, daemon=${daemon_ceiling}})"
+}
+
+_compose_up() {
+  local ds_dir="$1" compose_cmd="$2" compose_flags="$3" compose_err="$4"
+  shift 4
+  local ansi_flag cmd service_args
+  ansi_flag=$(_compose_ansi_flag "$compose_cmd")
+  cmd="${compose_cmd}"
+  [[ -n "$ansi_flag" ]] && cmd="${cmd} ${ansi_flag}"
+  cmd="${cmd} ${compose_flags} up -d"
+  if [[ "$#" -gt 0 ]]; then
+    printf -v service_args ' %q' "$@"
+    cmd="${cmd}${service_args}"
+  fi
+
+  su - "$DREAM_USER" -c "cd ${ds_dir} && ${cmd}" 2>&1 \
+    | tee -a "$LOGFILE" | tee "$compose_err"
+}
+
+_compose_up_with_cpu_heal() {
+  local ds_dir="$1" compose_cmd="$2" compose_flags="$3" env_file="$4" scope="$5"
+  shift 5
+  local compose_err daemon_ceiling
+  compose_err=$(mktemp)
+
+  if _compose_up "$ds_dir" "$compose_cmd" "$compose_flags" "$compose_err" "$@"; then
+    rm -f "$compose_err"
+    return 0
+  fi
+
+  if _compose_output_has_cpu_error "$compose_err"; then
+    daemon_ceiling=$(_extract_cpu_ceiling_from_compose_error "$compose_err")
+    if [[ -n "$daemon_ceiling" ]]; then
+      warn "CPU limit exceeds daemon ceiling (${daemon_ceiling}) during ${scope} — recapping and retrying"
+    else
+      warn "CPU limit exceeds host/daemon cores during ${scope} — recapping and retrying"
+    fi
+    _apply_host_cpu_caps "$ds_dir" "$env_file" "$daemon_ceiling" "$compose_flags"
+    if _compose_up "$ds_dir" "$compose_cmd" "$compose_flags" "$compose_err" "$@"; then
+      rm -f "$compose_err"
+      return 0
+    fi
+  fi
+
+  local missing_services
+  missing_services=$(_extract_missing_image_services "$compose_err")
+  if [[ -n "$missing_services" ]]; then
+    local missing_list
+    missing_list="${missing_services//$'\n'/, }"
+    warn "Compose failed due to missing images for services: ${missing_list}"
+    local service_output
+    if ! service_output=$(_compose_list_services "$ds_dir" "$compose_cmd" "$compose_flags"); then
+      warn "Failed to list compose services after missing-image error (non-fatal)"
+    else
+      local -A missing_map=()
+      local -a filtered_services=()
+      local service
+      while IFS= read -r service; do
+        [[ -n "$service" ]] && missing_map["$service"]=1
+      done <<< "$missing_services"
+      while IFS= read -r service; do
+        [[ -z "$service" ]] && continue
+        [[ -n "${missing_map[$service]:-}" ]] && continue
+        filtered_services+=("$service")
+      done <<< "$service_output"
+
+      if [[ "${#filtered_services[@]}" -gt 0 ]]; then
+        if _compose_up_with_flags "$ds_dir" "$compose_cmd" "$compose_flags" "$compose_err" "--no-deps" "${filtered_services[@]}"; then
+          warn "PARTIAL BRING-UP: started ${#filtered_services[@]} services, skipped (missing images): ${missing_list}"
+          rm -f "$compose_err"
+          return 0
+        fi
+      fi
+    fi
+  fi
+
+  rm -f "$compose_err"
+  return 1
+}
+
+_heal_dashboard_api_proxy() {
+  local env_file="$1"
+  local dashboard_port dashboard_api_port dash_status api_status
+  dashboard_port=$(env_get "$env_file" "DASHBOARD_PORT")
+  dashboard_port="${dashboard_port:-3001}"
+  dashboard_api_port=$(env_get "$env_file" "DASHBOARD_API_PORT")
+  dashboard_api_port="${dashboard_api_port:-3002}"
+
+  dash_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing") # stderr expected: container may not exist
+  api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing") # stderr expected: container may not exist
+  [[ "$dash_status" != "running" || "$api_status" != "running" ]] && return 0
+
+  if curl -sf --max-time 3 "http://127.0.0.1:${dashboard_api_port}/health" >/dev/null 2>&1 \
+    && ! curl -sf --max-time 4 "http://127.0.0.1:${dashboard_port}/api/status" >/dev/null 2>&1; then
+    warn "Dashboard returned API 502 while dashboard-api is healthy — restarting dashboard to refresh upstream"
+    # [NON-FATAL: dashboard] Individual service failure does not block others.
+    docker restart dream-dashboard 2>>"$LOGFILE" || warn "dashboard restart failed (non-fatal)"
+  fi
+}
+
+# Start DreamServer services via compose
+start_services() {
+  local ds_dir="$1"
+  local gpu_backend="${2:-auto}"
+  local env_file="${ds_dir}/.env"
+  local compose_cmd
+  compose_cmd=$(get_compose_cmd)
+
+  cd "$ds_dir" || exit 1
+  [[ "$gpu_backend" == "auto" ]] && gpu_backend=$(detect_gpu_backend)
+  if [[ -z "${GPU_BACKEND:-}" ]]; then
+    GPU_BACKEND="$gpu_backend"
+  fi
+
+  # Last-resort .env permission guard (fatal if fails — compose cannot start without readable .env)
+  if [[ -f "$env_file" ]]; then
+    # Check and fix ownership independently
+    if [[ "$(stat -c '%U' "$env_file" 2>>"$LOGFILE" || echo root)" != "${DREAM_USER}" ]]; then
+      chown "${DREAM_USER}:${DREAM_USER}" "$env_file" || {
+        err ".env ownership fix failed in start_services — Docker Compose cannot start"
+        exit 1
+      }
+    fi
+    # Check and fix mode independently
+    if [[ "$(stat -c '%a' "$env_file" 2>>"$LOGFILE")" != "660" ]]; then
+      chmod 0660 "$env_file" || {
+        err ".env chmod to 0660 failed in start_services — Docker Compose cannot start"
+        exit 1
+      }
+    fi
+  fi
+
+  local gpu_overlay="docker-compose.${gpu_backend}.yml"
+  if [[ ! -f "$gpu_overlay" && "$gpu_backend" != "cpu" ]]; then
+    warn "GPU overlay ${gpu_overlay} not found — falling back to nvidia"
+    gpu_overlay="docker-compose.nvidia.yml"
+  fi
+
+  local compose_flags="-f docker-compose.base.yml"
+  if [[ "$gpu_backend" != "cpu" && -f "$gpu_overlay" ]]; then
+    compose_flags="${compose_flags} -f ${gpu_overlay}"
+  fi
+
+  # Prefer upstream compose stack resolver
+  if [[ -x "${ds_dir}/scripts/resolve-compose-stack.sh" ]]; then
+    log "Using DreamServer's resolve-compose-stack.sh"
+    local resolved_flags
+    resolved_flags=$(su - "$DREAM_USER" -c \
+      "cd ${ds_dir} && ./scripts/resolve-compose-stack.sh \
+        --gpu-backend ${gpu_backend} --gpu-count ${GPU_COUNT:-1}" 2>&1 || echo "")
+    if [[ -n "$resolved_flags" ]]; then
+      compose_flags="$resolved_flags"
+    fi
+  fi
+
+  _generate_p2p_gpu_overlay "$ds_dir"
+  if [[ -f "${ds_dir}/docker-compose.p2p-gpu.yml" ]]; then
+    compose_flags="${compose_flags} -f docker-compose.p2p-gpu.yml"
+  fi
+
+  _cleanup_stale_network
+  _apply_host_cpu_caps "$ds_dir" "$env_file" "" "$compose_flags"
+  expose_ports_for_vastai "$ds_dir"
+
+  if ! _compose_up_with_cpu_heal "$ds_dir" "$compose_cmd" "$compose_flags" "$env_file" "full compose"; then
+    warn "Full compose failed — trying core services only"
+    if ! _compose_up_with_cpu_heal "$ds_dir" "$compose_cmd" "$compose_flags" "$env_file" \
+      "core services" llama-server dashboard-api open-webui dashboard; then
+      warn "Core compose with llama failed — bringing up control plane only"
+      # [NON-FATAL: compose] Fallback failure still allows manual recovery.
+      _compose_up_with_cpu_heal "$ds_dir" "$compose_cmd" "$compose_flags" "$env_file" \
+        "control-plane services" dashboard-api dashboard open-webui \
+        || warn "control-plane compose up also failed (non-fatal)"
+    fi
+  fi
+
+  local normalized_ports
+  normalized_ports=$(_normalize_dashboard_api_port_envs "$env_file")
+  if [[ -n "$normalized_ports" ]]; then
+    log "Normalized commented port env values in .env: ${normalized_ports//$'\n'/, }"
+    # [NON-FATAL: dashboard] Individual service failure does not block others.
+    docker restart dream-dashboard-api 2>>"$LOGFILE" || warn "dashboard-api restart failed (non-fatal)"
+    # [NON-FATAL: dashboard] Individual service failure does not block others.
+    docker restart dream-dashboard 2>>"$LOGFILE" || warn "dashboard restart failed (non-fatal)"
+  fi
+
+  # If compose exited early, some containers may be left in Created state.
+  # Try to start them so users can still reach the control plane.
+  local created
+  created=$(docker ps -a --filter "status=created" --format '{{.Names}}' | grep '^dream-' || echo "")
+  if [[ -n "$created" ]]; then
+    warn "Some containers are still in Created state — attempting docker start"
+    while IFS= read -r cname; do
+      [[ -z "$cname" ]] && continue
+      # [NON-FATAL: service] Individual service failure does not block others.
+      docker start "$cname" >/dev/null 2>&1 || warn "start ${cname} failed (non-fatal)"
+    done <<< "$created"
+  fi
+
+  # Nudge dashboard if stuck in Created state
+  if docker ps -a --format '{{.Names}} {{.Status}}' 2>&1 | grep -q 'dream-dashboard Created'; then
+    # [NON-FATAL: dashboard] Individual service failure does not block others.
+    docker start dream-dashboard || warn "dashboard kick failed (non-fatal)"
+    log "Kicked dashboard out of Created state"
+  fi
+
+  _heal_dashboard_api_proxy "$env_file"
+  _ensure_host_agent_network_binding "$ds_dir"
+  # [NON-FATAL: host-agent] Agent availability only affects background downloads.
+  _ensure_host_agent_running "$ds_dir" || warn "Host agent unavailable - model downloads may fail until agent is started manually"
+  # [NON-FATAL: opencode] Optional service; failures do not block others.
+  _ensure_opencode_web_running "$ds_dir" || warn "OpenCode web unavailable (non-fatal)"
+}
diff --git a/dream-server/installers/p2p-gpu/phases/00-preflight.sh b/dream-server/installers/p2p-gpu/phases/00-preflight.sh
new file mode 100644
index 000000000..4df809593
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/00-preflight.sh
@@ -0,0 +1,252 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Phase 00: Preflight Checks
+# ============================================================================
+# Part of: dream-server/installers/p2p-gpu/phases/
+# Purpose: GPU detection (NVIDIA/AMD/CPU), disk/Docker/DNS validation,
+#          nvidia-container-toolkit setup
+#
+# Expects: MIN_DISK_GB, MIN_VRAM_MB, LOGFILE, log(), warn(), err(),
+#          find_dream_dir(), get_compose_cmd(), detect_gpu()
+# Provides: GPU_BACKEND, GPU_NAME, GPU_VRAM, GPU_COUNT, CPU_COUNT,
+#           DISK_AVAIL_GB (all exported for later phases)
+#
+# Fixes covered: #12 (NVIDIA toolkit), #13 (disk space), #14 (compose v1),
+#                #17 (DNS), #27 (AMD GPU), #28 (CPU-only fallback)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 0/12: Preflight checks"
+
+TLS_OK="true"
+
+# Must be root
+if [[ $EUID -ne 0 ]]; then
+  err "This script must be run as root. Run: sudo bash ${SCRIPT_NAME}"
+  exit 1
+fi
+
+# ── [FIX: gpu-dedup] Use single detect_gpu() function ──────────────────────
+detect_gpu
+
+case "$GPU_BACKEND" in
+  nvidia) log "NVIDIA GPU: ${GPU_NAME} × ${GPU_COUNT} (${GPU_VRAM} MiB VRAM each)" ;;
+  amd)    log "AMD GPU: ${GPU_NAME} × ${GPU_COUNT} (${GPU_VRAM} MiB VRAM)" ;;
+  cpu)    warn "No GPU detected — running in CPU-only mode (slower but functional)" ;;
+esac
+
+# Multi-GPU enumeration
+if [[ "${GPU_COUNT:-0}" -ge "${MULTIGPU_MIN_GPUS:-2}" ]]; then
+  enumerate_gpus
+  log "Multi-GPU: ${GPU_COUNT} GPUs, total VRAM: ${GPU_TOTAL_VRAM} MiB"
+  for i in "${!GPU_UUIDS[@]}"; do
+    log "  GPU[${i}]: ${GPU_NAMES[$i]} (${GPU_VRAMS[$i]} MiB) ${GPU_UUIDS[$i]}"
+  done
+fi
+
+CPU_COUNT=$(nproc)
+DISK_AVAIL_GB=$(df -BG --output=avail / 2>&1 | tail -1 | tr -dc '0-9')
+log "GPU backend: ${GPU_BACKEND} | CPUs: ${CPU_COUNT} | Disk: ${DISK_AVAIL_GB} GB"
+
+# VRAM check
+if [[ "$GPU_BACKEND" != "cpu" && "${GPU_VRAM:-0}" -lt "$MIN_VRAM_MB" ]]; then
+  warn "GPU VRAM (${GPU_VRAM} MiB) below recommended (${MIN_VRAM_MB} MiB) — small models only"
+fi
+
+# ── Disk space ──────────────────────────────────────────────────────────────
+_check_disk_space() {
+  local existing_install
+  existing_install=$(find_dream_dir 2>&1 || echo "")
+  if [[ "${DISK_AVAIL_GB:-0}" -lt "$MIN_DISK_GB" ]]; then
+    if [[ -n "$existing_install" && -f "${existing_install}/.env" ]]; then
+      warn "Disk (${DISK_AVAIL_GB} GB) below ${MIN_DISK_GB} GB, but DreamServer already installed"
+    else
+      err "Disk space (${DISK_AVAIL_GB} GB) below minimum (${MIN_DISK_GB} GB)."
+      err "DreamServer needs 40+ GB. Create a Vast.ai instance with more disk."
+      exit 1
+    fi
+  fi
+}
+_check_disk_space
+
+# ── Docker ──────────────────────────────────────────────────────────────────
+if ! command -v docker &>/dev/null; then
+  err "Docker not found. Use a Vast.ai image with Docker pre-installed."
+  exit 1
+fi
+
+COMPOSE_CMD=$(get_compose_cmd)
+compose_version="unknown"
+case "$COMPOSE_CMD" in
+  "docker compose")
+    compose_version=$(docker compose version --short 2>&1 || echo "unknown")
+    ;;
+  "docker-compose")
+    compose_version=$(docker-compose version --short 2>&1 || echo "unknown")
+    ;;
+esac
+log "Docker Compose: ${COMPOSE_CMD} (${compose_version})"
+
+# ── GPU passthrough verification ────────────────────────────────────────────
+_verify_nvidia_passthrough() {
+  local gpu_test_image="nvidia/cuda:12.4.1-base-ubuntu22.04"
+  local passthrough_timeout="${NVIDIA_DOCKER_TEST_TIMEOUT:-180}"
+  local probe_rc=0
+
+  log "Verifying NVIDIA Docker passthrough (timeout ${passthrough_timeout}s; first run may pull ${gpu_test_image})"
+  if timeout --signal=TERM "${passthrough_timeout}" \
+    docker run --rm --gpus all "${gpu_test_image}" nvidia-smi &>/dev/null; then
+    log "NVIDIA Docker passthrough verified"
+
+    # ── [FIX: nvml-mismatch] Detect and repair driver/library mismatch ────────
+    log "Checking for NVIDIA driver/library version misalignment..."
+    if detect_nvml_mismatch "${gpu_test_image}"; then
+      :
+    else
+      mismatch_status=$?
+      if [[ $mismatch_status -eq 1 ]]; then
+        warn "NVIDIA driver/library mismatch detected — attempting repair"
+        if ! repair_nvml_mismatch; then
+          warn "NVIDIA driver mismatch repair did not complete (non-fatal)"
+        fi
+      fi
+    fi
+
+    return 0
+  else
+    probe_rc=$?
+  fi
+
+  if [[ "$probe_rc" -eq 124 ]]; then
+    warn "NVIDIA GPU passthrough probe timed out after ${passthrough_timeout}s — checking toolkit..."
+  else
+    warn "NVIDIA GPU passthrough test failed (exit ${probe_rc}) — checking toolkit..."
+  fi
+
+  if [[ "$probe_rc" -ne 0 ]]; then
+    if ! dpkg -l nvidia-container-toolkit &>/dev/null; then
+      warn "nvidia-container-toolkit not installed — attempting install"
+
+      # [NON-FATAL: dpkg] apt will still enforce DPkg::Lock::Timeout.
+      _wait_for_dpkg_lock 60 || warn "dpkg lock not released in time — DPkg::Lock::Timeout will handle"
+
+      local keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
+      # [NON-FATAL: repo] Transient GPG/keyring failures should not halt install.
+      curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+        | gpg --dearmor --batch --yes --output "$keyring" 2>>"$LOGFILE" \
+        || warn "gpg key import failed (non-fatal)"
+      curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+        | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+        | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list > /dev/null
+      apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" update -qq 2>>"$LOGFILE" \
+        && apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" install -y -qq nvidia-container-toolkit 2>>"$LOGFILE"
+      # [NON-FATAL: nvidia-ctk] Toolkit may already be configured or unavailable.
+      nvidia-ctk runtime configure --runtime=docker 2>>"$LOGFILE" || warn "nvidia-ctk configure failed (non-fatal)"
+      # [NON-FATAL: docker] Docker may not be managed by systemctl on Vast.ai.
+      systemctl restart docker 2>>"$LOGFILE" || service docker restart 2>>"$LOGFILE" \
+        || warn "docker restart failed (non-fatal)"
+      log "nvidia-container-toolkit installed and configured"
+
+      # ── [FIX: nvml-mismatch] Re-check after toolkit install ──────────────
+      log "Re-checking for NVIDIA driver/library mismatch after toolkit install..."
+      if detect_nvml_mismatch "${gpu_test_image}"; then
+        :
+      else
+        mismatch_status=$?
+        if [[ $mismatch_status -eq 1 ]]; then
+          warn "NVIDIA driver/library mismatch detected — attempting repair"
+          if ! repair_nvml_mismatch; then
+            warn "NVIDIA driver mismatch repair did not complete (non-fatal)"
+          fi
+        fi
+      fi
+    fi
+  fi
+}
+
+_verify_amd_passthrough() {
+  [[ ! -e /dev/kfd ]] && warn "/dev/kfd not found — AMD GPU may not be container-accessible"
+  [[ ! -d /dev/dri ]] && warn "/dev/dri not found — AMD GPU rendering may not work"
+  if docker run --rm --device=/dev/kfd --device=/dev/dri rocm/rocm-terminal:latest rocm-smi &>/dev/null; then
+    log "AMD ROCm Docker passthrough verified"
+  else
+    warn "AMD ROCm Docker test failed — GPU may need driver configuration"
+  fi
+}
+
+[[ "$GPU_BACKEND" == "nvidia" ]] && _verify_nvidia_passthrough
+[[ "$GPU_BACKEND" == "amd" ]] && _verify_amd_passthrough
+
+# Re-detect GPU if initial detection returned cpu but nvidia-smi works now
+# (can happen after nvidia-container-toolkit install or stale state from previous run)
+if [[ "$GPU_BACKEND" == "cpu" ]] && command -v nvidia-smi &>/dev/null \
+  && nvidia-smi --query-gpu=name --format=csv,noheader &>/dev/null 2>&1; then
+  log "Re-running GPU detection after toolkit install..."
+  detect_gpu
+  if [[ "$GPU_BACKEND" != "cpu" ]]; then
+    log "GPU detected on retry: ${GPU_NAME} × ${GPU_COUNT} (${GPU_VRAM} MiB VRAM each)"
+  fi
+fi
+
+# ── DNS fix ─────────────────────────────────────────────────────────────────
+if ! host github.com &>/dev/null && ! nslookup github.com &>/dev/null; then
+  if ! curl -sf --max-time 5 https://github.com > /dev/null; then
+    warn "DNS resolution broken — adding Google DNS as fallback"
+    if ! grep -q '8.8.8.8' /etc/resolv.conf; then
+      echo "nameserver 8.8.8.8" >> /etc/resolv.conf
+      echo "nameserver 1.1.1.1" >> /etc/resolv.conf
+    fi
+  fi
+fi
+
+# ── HTTPS trust (proxy CA) ─────────────────────────────────────────────────
+_verify_https_trust() {
+  local urls=(
+    "https://huggingface.co"
+    "https://registry-1.docker.io/v2/"
+  )
+  local failed=false
+
+  if ! command -v curl &>/dev/null; then
+    warn "curl not found — skipping HTTPS trust check"
+    return 0
+  fi
+
+  for url in "${urls[@]}"; do
+    if curl -fsI --max-time 10 "$url" > /dev/null 2>>"$LOGFILE"; then
+      continue
+    fi
+    local rc=$?
+    if [[ "$rc" -eq 60 ]]; then
+      warn "HTTPS trust failure when contacting ${url} (curl exit 60)"
+      failed=true
+    else
+      warn "HTTPS check failed for ${url} (curl exit ${rc})"
+    fi
+  done
+
+  if [[ "$failed" == "true" ]]; then
+    TLS_OK="false"
+    warn "System TLS trust is broken — model downloads and Docker pulls will fail"
+    warn "If behind a proxy, install the proxy root CA, then run:"
+    warn "  cp /path/to/proxy-root.crt /usr/local/share/ca-certificates/proxy-root.crt"
+    warn "  update-ca-certificates --fresh"
+    warn "  systemctl restart docker"
+  fi
+}
+
+_verify_https_trust
+
+# ── /tmp permissions fix ────────────────────────────────────────────────────
+if [[ "$(stat -c '%a' /tmp)" != "1777" ]]; then
+  chown root:root /tmp
+  chmod 1777 /tmp
+  log "/tmp permissions fixed (was broken)"
+else
+  log "/tmp permissions OK"
+fi
+
+log "All preflight checks passed"
diff --git a/dream-server/installers/p2p-gpu/phases/01-dependencies.sh b/dream-server/installers/p2p-gpu/phases/01-dependencies.sh
new file mode 100644
index 000000000..5cda11a4c
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/01-dependencies.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 01: System Dependencies
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Install missing packages (git, curl, jq, aria2, acl, python3-yaml)
+#
+# Expects: LOGFILE, log()
+# Provides: All required CLI tools available in PATH
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 1/12: Installing system dependencies"
+
+pkgs_needed=()
+for pkg in sudo git curl jq wget openssl aria2 procps iproute2 acl python3-yaml; do
+  # python3-yaml is a library, check via python3 import
+  if [[ "$pkg" == "python3-yaml" ]]; then
+    python3 -c "import yaml" 2>&1 || pkgs_needed+=("$pkg")
+    continue
+  fi
+  command -v "$pkg" &>/dev/null || pkgs_needed+=("$pkg")
+done
+# ss is part of iproute2
+command -v ss &>/dev/null || pkgs_needed+=("iproute2")
+
+# Vast.ai instances often ship with stale PPAs (e.g. graphics-drivers) that
+# timeout during apt-get update and cause hard failures under set -e.
+# The GPU driver is already installed — these PPAs are not needed.
+for stale_ppa in graphics-drivers; do
+  if ls /etc/apt/sources.list.d/${stale_ppa}* &>/dev/null; then
+    rm -f /etc/apt/sources.list.d/${stale_ppa}*
+    log "Removed stale PPA: ${stale_ppa} (not needed — driver already installed)"
+  fi
+done
+
+# unattended-upgrades can hold the dpkg lock for minutes on fresh Vast.ai
+# instances. We rely on DPk::Lock::Timeout below, but if the lock is clearly
+# stuck, kill only unattended-upgrades (the typical culprit).
+# [NON-FATAL: dpkg] apt will still enforce DPkg::Lock::Timeout.
+_wait_for_dpkg_lock 90 || warn "dpkg lock not released in time — DPkg::Lock::Timeout will handle"
+
+# Disable unattended-upgrades permanently — it causes NVML mismatches
+# and dpkg lock contention on GPU instances
+if systemctl is-enabled unattended-upgrades &>/dev/null; then  # stderr expected: service check
+  # [NON-FATAL: systemd] Unattended-upgrades may not be managed on this host.
+  systemctl disable unattended-upgrades 2>>"$LOGFILE" || warn "Could not disable unattended-upgrades (non-fatal)"
+  # [NON-FATAL: systemd] Unattended-upgrades may not be managed on this host.
+  systemctl mask unattended-upgrades 2>>"$LOGFILE" || warn "Could not mask unattended-upgrades (non-fatal)"
+  log "Disabled unattended-upgrades (prevents NVIDIA driver/library mismatches)"
+fi
+
+if [[ ${#pkgs_needed[@]} -gt 0 ]]; then
+  # unattended-upgrades may briefly hold dpkg lock on fresh hosts.
+  apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-300}" update -qq 2>>"$LOGFILE"
+  apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-300}" install -y -qq "${pkgs_needed[@]}" 2>>"$LOGFILE"
+  log "Installed: ${pkgs_needed[*]}"
+else
+  log "All dependencies already present"
+fi
diff --git a/dream-server/installers/p2p-gpu/phases/02-user-setup.sh b/dream-server/installers/p2p-gpu/phases/02-user-setup.sh
new file mode 100644
index 000000000..63d7ff483
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/02-user-setup.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 02: User Setup
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Create dream user, configure sudo/docker group, copy SSH keys
+#
+# Expects: DREAM_USER, DREAM_HOME, log(), warn()
+# Provides: Non-root 'dream' user ready for DreamServer install
+#
+# Fixes covered: #01 (root user rejection), #02 (Docker socket denied)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 2/12: Creating user '${DREAM_USER}'"
+
+if id -u "$DREAM_USER" &>/dev/null; then
+  log "User '${DREAM_USER}' already exists"
+else
+  useradd -m -s /bin/bash -u 1000 "$DREAM_USER" 2>&1 || \
+    useradd -m -s /bin/bash "$DREAM_USER"
+  log "User '${DREAM_USER}' created"
+fi
+
+# Sudo access
+# [NON-FATAL: permissions] Sudo group add is convenience; install can proceed.
+usermod -aG sudo "$DREAM_USER" || warn "sudo group add failed (non-fatal)"
+echo "${DREAM_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-dream
+chmod 440 /etc/sudoers.d/90-dream
+
+# Docker group
+if getent group docker &>/dev/null; then
+  usermod -aG docker "$DREAM_USER"
+  log "Added ${DREAM_USER} to docker group"
+fi
+
+# Copy SSH keys for direct user access
+if [[ -d /root/.ssh && ! -d "${DREAM_HOME}/.ssh" ]]; then
+  cp -r /root/.ssh "${DREAM_HOME}/.ssh"
+  chown -R "${DREAM_USER}:${DREAM_USER}" "${DREAM_HOME}/.ssh"
+  chmod 700 "${DREAM_HOME}/.ssh"
+  find "${DREAM_HOME}/.ssh" -type f -exec chmod 600 {} +
+fi
+
+log "User configured"
diff --git a/dream-server/installers/p2p-gpu/phases/03-repository.sh b/dream-server/installers/p2p-gpu/phases/03-repository.sh
new file mode 100644
index 000000000..8633214ce
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/03-repository.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 03: Repository Setup
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Clone DreamServer repo or locate existing checkout
+#
+# Expects: DREAM_USER, DREAM_HOME, REPO_URL, REPO_BRANCH,
+#          log(), warn(), fix_ownership()
+# Provides: REPO_DIR (path to cloned repository)
+#
+# Fixes covered: #09 (dual directory confusion)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 3/12: Setting up DreamServer repository"
+
+REPO_DIR="${DREAM_HOME}/DreamServer"
+
+if [[ -d "${REPO_DIR}/.git" ]]; then
+  log "Repository already exists at ${REPO_DIR}"
+  su - "$DREAM_USER" -c "cd ${REPO_DIR} && git pull --ff-only" 2>&1 || \
+    warn "Could not pull latest (non-fatal — using existing checkout)"
+else
+  # Check alternate locations (some Vast.ai onstart scripts pre-clone)
+  found_repo=""
+  for candidate in /root/DreamServer /workspace/DreamServer /opt/DreamServer; do
+    if [[ -d "${candidate}/.git" ]]; then
+      found_repo="$candidate"
+      break
+    fi
+  done
+
+  if [[ -n "$found_repo" ]]; then
+    mv "$found_repo" "$REPO_DIR"
+    log "Moved repository from ${found_repo}"
+  else
+    su - "$DREAM_USER" -c "git clone --depth 1 --branch ${REPO_BRANCH} ${REPO_URL} ${REPO_DIR}"
+    log "Cloned DreamServer (shallow, branch: ${REPO_BRANCH})"
+  fi
+fi
+
+fix_ownership "$REPO_DIR" "$DREAM_USER"
diff --git a/dream-server/installers/p2p-gpu/phases/04-installer.sh b/dream-server/installers/p2p-gpu/phases/04-installer.sh
new file mode 100644
index 000000000..203186ceb
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/04-installer.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 04: Run Upstream Installer
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Execute DreamServer's install.sh with timeout protection
+#
+# Expects: REPO_DIR, DREAM_USER, INSTALLER_TIMEOUT, GPU_BACKEND, GPU_VRAM,
+#          GPU_COUNT, log(), warn(), err()
+# Provides: DreamServer installed (may be partial if timeout hit)
+#
+# Fixes covered: #25 (ComfyUI infinite hang), #26 (installer timeout)
+#
+# Modder notes:
+#   Timeout is non-fatal. Heavy services (ComfyUI, Whisper) download in
+#   background and are handled by later phases. We only cap the installer
+#   wait loop, not the actual containers.
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 4/12: Running DreamServer installer"
+
+warn "Running installer (${INSTALLER_TIMEOUT}s timeout)..."
+warn "Heavy services (ComfyUI, Whisper, etc.) will continue after timeout."
+
+install_exit=0
+installer_pid=""
+
+# Map detected VRAM to upstream installer tier system so non-interactive
+# installs on GPU hosts don't fall through to CPU-tier model selection.
+# Hard-fail philosophy: if GPU_BACKEND is nvidia but VRAM is unknown/zero,
+# we let the installer auto-detect rather than passing a wrong tier.
+installer_tier_arg=""
+if [[ "$GPU_BACKEND" == "nvidia" && "${GPU_VRAM:-0}" -gt 0 ]]; then
+  if   [[ "$GPU_VRAM" -ge 40000 ]]; then installer_tier_arg="--tier 4"
+  elif [[ "$GPU_VRAM" -ge 20000 ]]; then installer_tier_arg="--tier 3"
+  elif [[ "$GPU_VRAM" -ge 12000 ]]; then installer_tier_arg="--tier 2"
+  else                                   installer_tier_arg="--tier 1"
+  fi
+  log "Passing ${installer_tier_arg} to installer (GPU_VRAM=${GPU_VRAM} MiB)"
+fi
+
+# CDI containers can expose /dev/nvidia* without DRM vendor sysfs. Provide a
+# minimal sysfs override for the installer's detection phase when needed.
+drm_sys_override=""
+if [[ "$GPU_BACKEND" == "nvidia" && ( -e /dev/nvidiactl || -e /dev/nvidia0 ) ]]; then
+  has_drm_vendor=false
+  for vendor_path in /sys/class/drm/card*/device/vendor; do
+    if [[ -e "$vendor_path" ]]; then
+      has_drm_vendor=true
+      break
+    fi
+  done
+  if [[ "$has_drm_vendor" == "false" ]]; then
+    drm_sys_override="${TMPDIR:-/tmp}/dream-drm-sys"
+    mkdir -p "${drm_sys_override}/card0/device"
+    printf '0x10de\n' > "${drm_sys_override}/card0/device/vendor"
+    log "Providing DRM sysfs override at ${drm_sys_override} for containerized NVIDIA detection"
+  fi
+fi
+
+# sudo -E -u preserves GPU_BACKEND/GPU_VRAM/GPU_COUNT for the installer's
+# detection phase. The previous `su -` was a login shell and stripped them,
+# causing the installer to re-run its own (sysfs-based) detection which
+# fails on Vast.ai / RunPod / any CDI-based GPU container.
+sudo -E -u "$DREAM_USER" \
+  env HOME="${DREAM_HOME}" \
+    GPU_BACKEND="$GPU_BACKEND" \
+    GPU_VRAM="${GPU_VRAM:-0}" \
+    GPU_COUNT="${GPU_COUNT:-1}" \
+    GPU_NAME="${GPU_NAME:-unknown}" \
+    DREAM_DRM_SYS="${drm_sys_override:-}" \
+    bash -c "cd ${REPO_DIR} && ./install.sh --non-interactive ${installer_tier_arg}" &
+installer_pid=$!
+
+waited=0
+while kill -0 "$installer_pid" 2>/dev/null; do  # stderr expected: process may exit between checks
+  if [[ $waited -ge $INSTALLER_TIMEOUT ]]; then
+    warn "Installer reached ${INSTALLER_TIMEOUT}s limit — proceeding with setup"
+    # [NON-FATAL: cleanup] Installer may have exited before TERM.
+    kill -TERM "$installer_pid" 2>>"$LOGFILE" || warn "could not TERM installer (non-fatal)"
+    sleep 2
+    if kill -0 "$installer_pid" 2>>"$LOGFILE"; then
+      # [NON-FATAL: cleanup] Installer may have exited before KILL.
+      kill -9 "$installer_pid" 2>>"$LOGFILE" || warn "could not KILL installer (non-fatal)"
+    fi
+    # Child processes of the installer should die with their parent.
+    # No pkill -f needed — TERM/KILL on the parent suffices.
+    install_exit=124
+    break
+  fi
+  sleep 5
+  waited=$((waited + 5))
+  (( waited % 60 == 0 )) && log "Installer running... (${waited}s / ${INSTALLER_TIMEOUT}s max)"
+done
+
+if [[ $install_exit -ne 124 ]]; then
+  wait "$installer_pid" 2>>"$LOGFILE" || install_exit=$?
+fi
+
+if [[ $install_exit -eq 0 ]]; then
+  log "DreamServer installer completed successfully"
+elif [[ $install_exit -eq 124 ]]; then
+  log "Installer timed out (normal for heavy services) — continuing"
+else
+  warn "Installer exited with code ${install_exit} — applying fixes and continuing"
+fi
diff --git a/dream-server/installers/p2p-gpu/phases/05-post-install.sh b/dream-server/installers/p2p-gpu/phases/05-post-install.sh
new file mode 100644
index 000000000..4b39d6cbe
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/05-post-install.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 05: Post-Install Fixes
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Locate active dream-server directory, apply all post-install fixes
+#
+# Expects: DREAM_HOME, REPO_DIR, GPU_BACKEND, DREAM_USER,
+#          log(), warn(), err(), find_dream_dir(), fix_ownership(),
+#          apply_post_install_fixes()
+# Provides: DS_DIR (active dream-server path)
+#
+# Fixes covered: #03 (/tmp), #04 (CPU overflow), #05 (n8n uid), #06 (dashboard-api),
+#                #07 (comfyui write), #08 (WEBUI_SECRET), #15 (.env dupes)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 5/12: Locating directory & applying fixes"
+
+DS_DIR=$(find_dream_dir) || {
+  err "Could not find dream-server directory after install"
+  err "Expected at: ${DREAM_HOME}/dream-server or ${REPO_DIR}/dream-server"
+  exit 1
+}
+
+log "Active directory: ${DS_DIR}"
+fix_ownership "$DS_DIR" "$DREAM_USER"
+
+apply_post_install_fixes "$DS_DIR" "$GPU_BACKEND"
+
+# Fix secondary directory if dual-install occurred
+alt_dir=""
+if [[ "$DS_DIR" == "${DREAM_HOME}/dream-server" && -d "${REPO_DIR}/dream-server" ]]; then
+  alt_dir="${REPO_DIR}/dream-server"
+elif [[ "$DS_DIR" == "${REPO_DIR}/dream-server" && -d "${DREAM_HOME}/dream-server" ]]; then
+  alt_dir="${DREAM_HOME}/dream-server"
+fi
+
+if [[ -n "$alt_dir" && -f "${alt_dir}/.env" ]]; then
+  apply_post_install_fixes "$alt_dir" "$GPU_BACKEND"
+  log "Also fixed secondary directory: ${alt_dir}"
+fi
+
+# Cap llama-server context based on GPU VRAM budget
+_cap_context_for_vram "$DS_DIR"
+
+# -- Ensure data/persona/SOUL.md exists ------------------------------------
+# Hermes compose bind-mounts this file. If missing, Docker creates it as a
+# directory -> container crashes with "not a directory" error.
+_ensure_persona_file() {
+  local ds_dir="$1"
+  local persona_file="${ds_dir}/data/persona/SOUL.md"
+  local template="${ds_dir}/extensions/services/hermes/SOUL.md.template"
+
+  if [[ -f "$persona_file" ]]; then
+    return 0
+  fi
+
+  mkdir -p "${ds_dir}/data/persona"
+
+  # If Docker already created it as a directory, remove it
+  if [[ -d "$persona_file" ]]; then
+    log "Removing Docker-created directory at ${persona_file}"
+    # [NON-FATAL: cleanup] Best-effort cleanup; template fallback still works.
+    rm -rf "$persona_file" 2>>"$LOGFILE" || warn "Could not remove directory at ${persona_file} (non-fatal)"
+  fi
+
+  # Try rendering via upstream script first
+  local context_script="${ds_dir}/scripts/build-installation-context.py"
+  if [[ -x "$context_script" ]] && command -v python3 &>/dev/null; then
+    if su - "$DREAM_USER" -c "cd ${ds_dir} && python3 scripts/build-installation-context.py" \
+      >> "$LOGFILE" 2>&1; then
+      if [[ -f "$persona_file" ]]; then
+        log "Persona file rendered via build-installation-context.py"
+        return 0
+      fi
+    else
+      warn "build-installation-context.py failed (non-fatal) - using template"
+    fi
+  fi
+
+  # Fallback: copy template directly
+  if [[ -f "$template" ]]; then
+    cp "$template" "$persona_file"
+    chown "${DREAM_USER}:${DREAM_USER}" "$persona_file"
+    log "Persona file created from template at ${persona_file}"
+  else
+    # Last resort: create minimal placeholder so the mount does not fail
+    cat > "$persona_file" << 'SOUL_EOF'
+# DreamServer Persona
+You are Dream, a helpful AI assistant powered by DreamServer.
+SOUL_EOF
+    chown "${DREAM_USER}:${DREAM_USER}" "$persona_file"
+    log "Minimal persona placeholder created at ${persona_file}"
+  fi
+
+  # Final verification - if still not a regular file, something is wrong
+  if [[ ! -f "$persona_file" ]]; then
+    warn "SOUL.md is still not a regular file at ${persona_file} - hermes container will fail to mount"
+    warn "Manual fix: rm -rf ${persona_file} && cp ${template} ${persona_file}"
+  fi
+}
+
+_ensure_persona_file "$DS_DIR"
+
+# Ensure llama-server config mount points are regular files, not Docker-created directories
+_ensure_mount_files() {
+  local ds_dir="$1"
+  local models_ini="${ds_dir}/config/llama-server/models.ini"
+
+  # models.ini - llama-server bind mount
+  if [[ -d "$models_ini" ]]; then
+    log "Removing Docker-created directory at ${models_ini}"
+    rm -rf "$models_ini"
+  fi
+  if [[ ! -f "$models_ini" ]]; then
+    mkdir -p "${ds_dir}/config/llama-server"
+    touch "$models_ini"
+    chown "${DREAM_USER}:${DREAM_USER}" "$models_ini"
+    log "Created empty ${models_ini}"
+  fi
+}
+
+_ensure_mount_files "$DS_DIR"
diff --git a/dream-server/installers/p2p-gpu/phases/06-bootstrap-model.sh b/dream-server/installers/p2p-gpu/phases/06-bootstrap-model.sh
new file mode 100644
index 000000000..9007a1d0d
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/06-bootstrap-model.sh
@@ -0,0 +1,230 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Phase 06: Bootstrap Model
+# ============================================================================
+# Part of: dream-server/installers/p2p-gpu/phases/
+# Purpose: Ensure a usable GGUF model file exists so llama-server can start.
+#          If the GPU can handle a bigger model, download it in the background
+#          and hot-swap once ready (zero downtime).
+#
+# Expects: DS_DIR, GPU_BACKEND, GPU_VRAM, GPU_COUNT,
+#          log(), warn(), err(), env_get(), env_set(),
+#          fix_known_uid_requirements(), apply_data_acl(),
+#          check_disk_for_download(), resolve_model_url(),
+#          resolve_tier_for_gpu(), _store_pid(), create_model_swap_watcher()
+# Provides: Verified GGUF_FILE in .env pointing to a real model;
+#           background download of tier model + swap watcher (if bootstrapped)
+#
+# Fixes covered: #19 (bootstrap model missing), #20 (llama-server hang)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 6/12: Ensuring bootstrap model is available"
+
+# Derive LLM_MODEL identifier from GGUF filename.
+# Strips .gguf extension and quantization suffix, lowercases.
+# Example: Qwen3-30B-A3B-Q4_K_M.gguf -> qwen3-30b-a3b
+_derive_llm_model() {
+  echo "$1" \
+    | sed -E 's/\.(gguf|GGUF)$//' \
+    | sed -E 's/-Q[0-9]+([._][A-Za-z0-9]+)*$//' \
+    | tr '[:upper:]' '[:lower:]'
+}
+
+env_file="${DS_DIR}/.env"
+data_dir="${DS_DIR}/data"
+models_dir="${data_dir}/models"
+mkdir -p "$models_dir"
+
+model_ready=false
+
+# ── Step 1: Resolve the GPU-optimal tier model ────────────────────────────────
+# This is the model the GPU *should* run. We determine it from VRAM, not from
+# whatever the installer may or may not have written to .env.
+resolve_tier_for_gpu "$DS_DIR" "$GPU_BACKEND" "${GPU_VRAM:-0}" "${GPU_COUNT:-1}"
+tier_gguf="${TIER_GGUF_FILE}"
+tier_url="${TIER_GGUF_URL}"
+tier_size_mb="${TIER_MODEL_SIZE_MB}"
+
+# Persist model size for VRAM budget calculations in later phases
+if [[ "${TIER_MODEL_SIZE_MB:-0}" -gt 0 ]]; then
+  env_set "$env_file" "LLM_MODEL_SIZE_MB" "$TIER_MODEL_SIZE_MB"
+fi
+
+if [[ -n "$tier_gguf" ]]; then
+  log "GPU-optimal model for ${GPU_BACKEND} (${GPU_VRAM:-0}MB VRAM): ${tier_gguf} (~${tier_size_mb}MB)"
+else
+  warn "Could not determine tier model — will use bootstrap model only"
+fi
+
+# ── Step 2: Check if we already have a usable model ──────────────────────────
+
+# Check if the tier model itself is already downloaded
+if [[ -n "$tier_gguf" && -f "${models_dir}/${tier_gguf}" ]]; then
+  file_size=$(stat -c%s "${models_dir}/${tier_gguf}" || echo 0)
+  if [[ $file_size -gt 100000000 ]]; then
+    env_set "$env_file" "GGUF_FILE" "$tier_gguf"
+    env_set "$env_file" "LLM_MODEL" "$(_derive_llm_model "$tier_gguf")"
+    model_ready=true
+    log "Tier model already present: ${tier_gguf} ($(( file_size / 1048576 )) MB)"
+  else
+    warn "Tier model exists but too small (${file_size} bytes) — likely corrupt"
+    rm -f "${models_dir}/${tier_gguf}"
+  fi
+fi
+
+# Check configured GGUF_FILE from .env
+if [[ "$model_ready" != "true" ]]; then
+  gguf_file=$(env_get "$env_file" "GGUF_FILE")
+  if [[ -n "$gguf_file" && -f "${models_dir}/${gguf_file}" ]]; then
+    file_size=$(stat -c%s "${models_dir}/${gguf_file}" || echo 0)
+    if [[ $file_size -gt 100000000 ]]; then
+      model_ready=true
+      log "Model verified: ${gguf_file} ($(( file_size / 1048576 )) MB)"
+      if [[ -z "$(env_get "$env_file" "LLM_MODEL")" ]]; then
+        env_set "$env_file" "LLM_MODEL" "$(_derive_llm_model "$gguf_file")"
+      fi
+    else
+      warn "Model file exists but too small (${file_size} bytes) — likely corrupt"
+      rm -f "${models_dir}/${gguf_file}"
+    fi
+  fi
+fi
+
+# Check for ANY .gguf file as fallback
+if [[ "$model_ready" != "true" ]]; then
+  any_model=$(find "$models_dir" -name "*.gguf" -size +100M 2>>"$LOGFILE" | head -1 || echo "")
+  if [[ -n "$any_model" ]]; then
+    found_name=$(basename "$any_model")
+    env_set "$env_file" "GGUF_FILE" "$found_name"
+    env_set "$env_file" "LLM_MODEL" "$(_derive_llm_model "$found_name")"
+    model_ready=true
+    log "Found existing model: ${found_name} — updated GGUF_FILE"
+  fi
+fi
+
+# ── Step 3: Download bootstrap model if nothing usable exists ─────────────────
+if [[ "$model_ready" != "true" ]]; then
+  # [FIX: disk-check] Verify disk space before downloading
+  if ! check_disk_for_download "$models_dir" 2; then
+    err "Cannot download bootstrap model — insufficient disk space"
+    warn "Continuing without a model — llama-server will not start"
+  else
+    if [[ "${TLS_OK:-true}" != "true" ]]; then
+      warn "Skipping bootstrap download because TLS trust is broken (TLS_OK=false)"
+      warn "Fix TLS trust (proxy root CA) and re-run setup to download models"
+    else
+      warn "No usable model found — downloading bootstrap model..."
+      bootstrap_url="https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
+      bootstrap_name="Qwen3-0.6B-Q4_K_M.gguf"
+
+      if command -v aria2c &>/dev/null; then
+        set +e
+        aria2c -x 8 -s 8 -k 5M --file-allocation=none --console-log-level=notice \
+          --check-integrity=true \
+          -d "$models_dir" -o "$bootstrap_name" "$bootstrap_url" 2>&1 | tail -5
+        dl_rc=${PIPESTATUS[0]}
+        set -e
+        if [[ "$dl_rc" -ne 0 ]]; then
+          warn "Bootstrap download failed (aria2c exit ${dl_rc}) — check TLS/proxy CA"
+        fi
+      else
+        set +e
+        curl -L --fail --progress-bar -o "${models_dir}/${bootstrap_name}" "$bootstrap_url"
+        dl_rc=$?
+        set -e
+        if [[ "$dl_rc" -ne 0 ]]; then
+          warn "Bootstrap download failed (curl exit ${dl_rc}) — check TLS/proxy CA"
+        fi
+      fi
+
+      # [FIX: bootstrap-size] Validate downloaded file size (>50MB for smallest GGUF)
+      if [[ -f "${models_dir}/${bootstrap_name}" ]]; then
+        dl_size=$(stat -c%s "${models_dir}/${bootstrap_name}" || echo 0)
+        if [[ "$dl_size" -gt 50000000 ]]; then
+          env_set "$env_file" "GGUF_FILE" "$bootstrap_name"
+          env_set "$env_file" "LLM_MODEL" "$(_derive_llm_model "$bootstrap_name")"
+          model_ready=true
+          log "Bootstrap model downloaded: ${bootstrap_name} ($(( dl_size / 1048576 )) MB)"
+        else
+          err "Downloaded model too small (${dl_size} bytes) — likely incomplete or corrupt"
+          rm -f "${models_dir}/${bootstrap_name}"
+          warn "Continuing without a model — llama-server will not start"
+        fi
+      else
+        err "Failed to download bootstrap model — llama-server will not start"
+        warn "Continuing anyway — other services may still work"
+      fi
+    fi
+  fi
+fi
+
+# ── Step 4: Queue background download of tier model if needed ─────────────────
+# If we're running a smaller model than what the GPU can handle, download the
+# tier model in the background. The swap watcher will hot-swap GGUF_FILE and
+# recreate llama-server via `docker compose up -d` once the download completes.
+current_gguf=$(env_get "$env_file" "GGUF_FILE")
+if [[ "${TLS_OK:-true}" != "true" ]]; then
+  warn "Skipping tier model download because TLS trust is broken (TLS_OK=false)"
+elif [[ -n "$tier_gguf" && "$tier_gguf" != "${current_gguf:-}" ]]; then
+  # Determine disk space needed (model size in MB → GB, rounded up + 2GB buffer)
+  needed_gb=$(( (tier_size_mb / 1024) + 2 ))
+  [[ $needed_gb -lt 5 ]] && needed_gb=5
+
+  if check_disk_for_download "$models_dir" "$needed_gb"; then
+    # Resolve URL: prefer TIER_GGUF_URL from tier resolution, fallback to resolve_model_url
+    if [[ -z "$tier_url" ]]; then
+      tier_url=$(resolve_model_url "$DS_DIR" "$tier_gguf") || tier_url=""
+    fi
+
+    if [[ -n "$tier_url" ]]; then
+      log "Queuing background download: ${tier_gguf} (~${tier_size_mb}MB)"
+      log "  URL: ${tier_url}"
+      log "  Current model: ${current_gguf:-none}"
+      log "  Once complete, llama-server will auto-swap to the bigger model"
+      mkdir -p "${DS_DIR}/logs"
+
+      if command -v aria2c &>/dev/null; then
+        nohup aria2c \
+          -x 8 -s 8 -k 10M \
+          --continue=true \
+          --max-tries=0 \
+          --retry-wait=5 \
+          --timeout=60 \
+          --connect-timeout=30 \
+          --file-allocation=none \
+          --auto-file-renaming=false \
+          --console-log-level=warn \
+          --summary-interval=30 \
+          --check-integrity=true \
+          -d "$models_dir" \
+          -o "$tier_gguf" \
+          "$tier_url" \
+          >> "${DS_DIR}/logs/aria2c-download.log" 2>&1 &
+      else
+        nohup curl -L --fail -o "${models_dir}/${tier_gguf}" "$tier_url" \
+          >> "${DS_DIR}/logs/aria2c-download.log" 2>&1 &
+      fi
+
+      dl_pid=$!
+      _store_pid "aria2c-model" "$dl_pid"
+      log "Background download started (PID: ${dl_pid})"
+      create_model_swap_watcher "$DS_DIR" "$tier_gguf"
+    else
+      warn "Could not resolve download URL for ${tier_gguf} — staying on ${current_gguf:-bootstrap model}"
+    fi
+  else
+    warn "Insufficient disk for tier model (~${tier_size_mb}MB) — staying on ${current_gguf:-bootstrap model}"
+  fi
+elif [[ -n "$tier_gguf" && "$tier_gguf" == "${current_gguf:-}" ]]; then
+  log "Already running the GPU-optimal model: ${tier_gguf}"
+fi
+
+fix_known_uid_requirements "$data_dir" "$GPU_BACKEND"
+apply_data_acl "$models_dir"
+
+# Re-run VRAM context cap now that we know the actual model size
+_cap_context_for_vram "$DS_DIR"
diff --git a/dream-server/installers/p2p-gpu/phases/07-model-optimize.sh b/dream-server/installers/p2p-gpu/phases/07-model-optimize.sh
new file mode 100644
index 000000000..e4eba704c
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/07-model-optimize.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 07: Model Download Optimization
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Resume incomplete downloads with aria2c multi-threaded transfer,
+#          start model swap watcher
+#
+# Expects: DS_DIR, log(), optimize_model_download()
+# Provides: Background aria2c download + model swap watcher (if needed)
+#
+# Fixes covered: #11 (HuggingFace Xet throttle)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 7/12: Optimizing model downloads"
+
+optimize_model_download "$DS_DIR"
diff --git a/dream-server/installers/p2p-gpu/phases/08-vastai-quirks.sh b/dream-server/installers/p2p-gpu/phases/08-vastai-quirks.sh
new file mode 100644
index 000000000..138c75005
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/08-vastai-quirks.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 08: Vast.ai Quirks
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: No-systemd workaround, /dev/shm remount, OpenCode crash-loop fix
+#
+# Expects: DS_DIR, DREAM_USER, log(), warn()
+# Provides: Vast.ai-specific environment fixes applied
+#
+# Fixes covered: #18 (/dev/shm), #21 (no systemd), #22 (OpenCode crash-loop),
+#                #24 (/dev/shm too small)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 8/12: Applying Vast.ai-specific fixes"
+
+# ── No systemd ─────────────────────────────────────────────────────────────
+if ! command -v systemctl &>/dev/null && ! pidof systemd &>/dev/null; then
+  log "No systemd detected — Vast.ai environment confirmed"
+  dream_cli="${DS_DIR}/dream-cli"
+  if [[ -x "$dream_cli" ]]; then
+    # Start host agent early on no-systemd hosts so model downloads and dashboard
+    # operations are available before the compose stack fully settles.
+    # [NON-FATAL: host-agent] Agent start can be retried in later phases.
+    su - "$DREAM_USER" -c "cd ${DS_DIR} && DREAM_HOME=${DS_DIR} ./dream-cli agent start" 2>&1 || \
+      warn "Host agent start failed (non-fatal — will retry in phase 09)"
+  fi
+fi
+
+# ── OpenCode crash-loop disable ────────────────────────────────────────────
+if docker ps -a --format '{{.Names}} {{.Status}}' 2>&1 | grep -q 'dream-opencode.*Restarting'; then
+  warn "OpenCode is crash-looping — disabling to unblock other services"
+  dream_cli="${DS_DIR}/dream-cli"
+  if [[ -x "$dream_cli" ]]; then
+    # [NON-FATAL: opencode] Individual service failure does not block others.
+    su - "$DREAM_USER" -c "cd ${DS_DIR} && ./dream-cli disable opencode" 2>&1 \
+      || warn "dream-cli disable opencode failed (non-fatal)"
+  else
+    # [NON-FATAL: opencode] Individual service failure does not block others.
+    docker stop dream-opencode || warn "opencode stop failed (non-fatal)"
+    # [NON-FATAL: opencode] Individual service failure does not block others.
+    docker rm dream-opencode || warn "opencode rm failed (non-fatal)"
+  fi
+fi
+
+# ── Shared memory fix ─────────────────────────────────────────────────────
+shm_size_kb=$(df /dev/shm 2>&1 | awk 'NR==2{print $2}' || echo 0)
+if [[ "${shm_size_kb:-0}" -lt 1048576 ]]; then
+  shm_mb=$(( shm_size_kb / 1024 ))
+  warn "/dev/shm is only ${shm_mb} MB — GPU containers may be memory-starved"
+  # [NON-FATAL: perf] Remount is a performance optimization only.
+  mount -o remount,size=4G /dev/shm || warn "/dev/shm remount failed (non-fatal)"
+fi
+
+# ── Pre-pull Docker images ─────────────────────────────────────────────────
+prepull_docker_images "$DS_DIR"
+
+log "Vast.ai environment fixes applied"
diff --git a/dream-server/installers/p2p-gpu/phases/09-services.sh b/dream-server/installers/p2p-gpu/phases/09-services.sh
new file mode 100644
index 000000000..cbb944fc3
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/09-services.sh
@@ -0,0 +1,312 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 09: Services & Health Check
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Start all services, run health-check loop with llama-server
+#          diagnostics, report per-service status
+#
+# Expects: DS_DIR, GPU_BACKEND, LOGFILE, log(), warn(), err(),
+#          env_get(), env_set(), start_services(), discover_all_services()
+# Provides: Running DreamServer stack with status report
+#
+# Fixes covered: #10 (Dashboard stuck), #20 (llama-server hang),
+#                #23 (CUDA OOM), #25 (ComfyUI hang)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 9/12: Starting services"
+
+# Verify the configured model file exists - llama-server will crash without it
+_verify_model_file() {
+  local ds_dir="$1"
+  local env_file="${ds_dir}/.env"
+  local gguf_file models_dir
+
+  gguf_file="$(env_get "$env_file" "GGUF_FILE")"
+  gguf_file="${gguf_file:-Qwen3.5-9B-Q4_K_M.gguf}"
+  models_dir="${ds_dir}/data/models"
+
+  if [[ -f "${models_dir}/${gguf_file}" ]]; then
+    log "Model file verified: ${gguf_file} ($(du -h "${models_dir}/${gguf_file}" | cut -f1))"
+    return 0
+  fi
+
+  warn "Model file ${gguf_file} not found in ${models_dir}"
+
+  # Check if any .gguf file exists as fallback
+  local fallback
+  fallback="$(find "$models_dir" -maxdepth 1 -name '*.gguf' -printf '%f\n' 2>/dev/null | head -1)"  # stderr expected: find probe
+  if [[ -n "$fallback" ]]; then
+    log "Found fallback model: ${fallback} - updating GGUF_FILE in .env"
+    env_set "$env_file" "GGUF_FILE" "$fallback"
+    return 0
+  fi
+
+  warn "No .gguf model files found - llama-server will be unhealthy"
+  warn "Download a model: wget -P ${models_dir} https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
+}
+
+# Verify that the model endpoint exposes at least one selectable model for Open WebUI.
+_verify_model_visibility() {
+  local env_file="${DS_DIR}/.env"
+  local ollama_port webui_port
+
+  ollama_port="$(env_get "$env_file" "OLLAMA_PORT")"
+  ollama_port="${ollama_port:-11434}"
+  webui_port="$(env_get "$env_file" "OPEN_WEBUI_PORT")"
+  webui_port="${webui_port:-3000}"
+
+  local model_count=0
+  local models_json
+  models_json="$(curl -sf --max-time 5 "http://127.0.0.1:${ollama_port}/v1/models" 2>/dev/null || echo "")"  # stderr expected: service may not be ready
+  if [[ -n "$models_json" ]]; then
+    model_count="$(echo "$models_json" | python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("data", [])))' 2>/dev/null || echo 0)"  # stderr expected: json parse may fail if the endpoint is not ready
+  fi
+
+  if [[ "$model_count" -gt 0 ]]; then
+    log "LLM model visible on API (${model_count} model(s) on port ${ollama_port})"
+  else
+    warn "No models visible on llama-server API (port ${ollama_port}) — Open WebUI may show 'Model not selected'"
+    warn "Check: curl http://127.0.0.1:${ollama_port}/v1/models"
+    warn "Open WebUI port ${webui_port} should refresh after the model API becomes available"
+  fi
+}
+
+# Multi-GPU: run topology detection and GPU-to-service assignment before startup
+_verify_model_file "$DS_DIR"
+
+if [[ "${GPU_COUNT:-0}" -ge "${MULTIGPU_MIN_GPUS:-2}" ]]; then
+  run_gpu_assignment "$DS_DIR" "${DS_DIR}/.env"
+fi
+
+start_services "$DS_DIR"
+
+# ── Health-check loop with llama-server diagnostics ─────────────────────────
+_run_health_check() {
+  local env_file="${DS_DIR}/.env"
+  local models_dir="${DS_DIR}/data/models"
+  local max_wait=120 elapsed=0 llama_diagnosed=false
+
+  echo -n "  Waiting for services "
+  while [[ $elapsed -lt $max_wait ]]; do
+    local healthy running dash_api_status dashboard_status webui_status
+    healthy=$(docker ps --filter "health=healthy" --format '{{.Names}}' | wc -l)
+    running=$(docker ps --format '{{.Names}}' | wc -l)
+    dash_api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing") # stderr expected: container may not exist
+    dashboard_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing") # stderr expected: container may not exist
+    webui_status=$(
+      docker inspect --format '{{.State.Status}}' dream-webui 2>/dev/null || # stderr expected: container may not exist
+      docker inspect --format '{{.State.Status}}' dream-open-webui 2>/dev/null || # stderr expected: container may not exist
+      echo "missing"
+    )
+    echo -n "."
+
+    if [[ $healthy -ge 3 && "$dash_api_status" == "running" \
+      && ( "$dashboard_status" == "running" || "$webui_status" == "running" ) ]]; then
+      echo ""
+      log "Core services healthy (${healthy}/${running} containers)"
+      return 0
+    fi
+
+    # Diagnose llama-server at 45s mark
+    if [[ $elapsed -ge 45 && "$llama_diagnosed" != "true" ]]; then
+      llama_diagnosed=true
+      _diagnose_llama "$env_file" "$models_dir"
+    fi
+
+    sleep 5
+    elapsed=$((elapsed + 5))
+  done
+
+  echo ""
+  warn "Health-check timeout (${max_wait}s) — some services may still be starting"
+}
+
+_diagnose_llama() {
+  local env_file="$1" models_dir="$2"
+  local llama_status
+  llama_status=$(docker inspect --format '{{.State.Status}}' dream-llama-server 2>&1 || echo "missing")
+
+  [[ "$llama_status" != "restarting" ]] && return 0
+
+  echo ""
+  warn "llama-server is crash-looping — diagnosing..."
+  local llama_logs
+  llama_logs=$(docker logs --tail 20 dream-llama-server 2>&1 || echo "")
+
+  if echo "$llama_logs" | grep -qi "CUDA out of memory\|out of memory\|OOM"; then
+    _handle_oom "$env_file" "$models_dir"
+  elif echo "$llama_logs" | grep -qi "No such file\|model file not found\|failed to load"; then
+    _handle_missing_model "$env_file" "$models_dir"
+  elif echo "$llama_logs" | grep -qi "address already in use\|bind failed"; then
+    err "Port conflict on llama-server port!"
+    warn "Check: ss -tlnp | grep :8080"
+  fi
+}
+
+_handle_oom() {
+  local env_file="$1" models_dir="$2"
+  err "Model too large for GPU VRAM!"
+  warn "Switching to smallest bootstrap model..."
+
+  local tiny_url="https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
+  local tiny_name="Qwen3-0.6B-Q4_K_M.gguf"
+  if [[ ! -f "${models_dir}/${tiny_name}" ]]; then
+    aria2c -x 8 -s 8 -d "$models_dir" -o "$tiny_name" "$tiny_url" 2>&1 || \
+      curl -sfL -o "${models_dir}/${tiny_name}" "$tiny_url"
+  fi
+  env_set "$env_file" "GGUF_FILE" "$tiny_name"
+  # [NON-FATAL: llama] Individual service failure does not block others.
+  docker restart dream-llama-server || warn "llama-server restart failed (non-fatal)"
+  echo -n "  Retrying with smaller model "
+}
+
+_handle_missing_model() {
+  local env_file="$1" models_dir="$2"
+  err "Model file not found by llama-server!"
+  local current_gguf
+  current_gguf=$(env_get "$env_file" "GGUF_FILE")
+  if [[ -n "$current_gguf" && ! -f "${models_dir}/${current_gguf}" ]]; then
+    warn "GGUF_FILE='${current_gguf}' does not exist in ${models_dir}/"
+    local fallback
+    fallback=$(find "$models_dir" -name "*.gguf" -size +50M 2>&1 | head -1 | xargs -r basename || echo "")
+    if [[ -n "$fallback" ]]; then
+      env_set "$env_file" "GGUF_FILE" "$fallback"
+      # [NON-FATAL: llama] Individual service failure does not block others.
+      docker restart dream-llama-server || warn "llama-server restart failed (non-fatal)"
+      warn "Switched to ${fallback}"
+    fi
+  fi
+}
+
+_run_health_check
+_verify_model_visibility
+
+# ── Service status report ──────────────────────────────────────────────────
+_report_service_status() {
+  echo ""
+  echo -e "${BOLD}Service Status:${NC}"
+  echo ""
+
+  local -a core_services=(
+    "llama-server|dream-llama-server"
+    "open-webui|dream-webui"
+    "dashboard|dream-dashboard"
+    "dashboard-api|dream-dashboard-api"
+  )
+  local -a heavy_services=()
+  local -a normal_services=()
+
+  while IFS='|' read -r sid _pe _pd _name _cat _proxy startup _cname; do
+    [[ -z "$sid" ]] && continue
+    case "$sid" in open-webui|dashboard|dashboard-api) continue ;; esac
+    local container_name="${_cname:-dream-${sid}}"
+    if [[ "$startup" == "heavy" ]]; then
+      heavy_services+=("${sid}|${container_name}")
+    else
+      normal_services+=("${sid}|${container_name}")
+    fi
+  done < <(discover_all_services "$DS_DIR")
+
+  _report_containers "${core_services[@]}"
+  _report_heavy "${heavy_services[@]}"
+  _report_normal "${normal_services[@]}"
+  _report_background_downloads
+
+  echo ""
+}
+
+_report_containers() {
+  for entry in "$@"; do
+    local svc container
+    IFS='|' read -r svc container <<< "$entry"
+    [[ -z "$container" ]] && container="dream-${svc}"
+
+    local status health
+    if ! status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null); then  # stderr expected: container may not exist
+      status="not found"
+    fi
+    if ! health=$(docker inspect --format '{{.State.Health.Status}}' "$container" 2>/dev/null); then  # stderr expected: container may not expose healthcheck
+      health="none"
+    fi
+
+    if [[ "$health" == "healthy" ]]; then
+      echo -e "  ${GREEN}✓${NC} ${svc}: healthy"
+    elif [[ "$status" == "running" ]]; then
+      echo -e "  ${YELLOW}◌${NC} ${svc}: starting up..."
+    elif [[ "$status" == "restarting" ]]; then
+      echo -e "  ${RED}↻${NC} ${svc}: restarting (check: docker logs ${container})"
+    elif [[ "$status" == "not found" ]]; then
+      echo -e "  ${DIM}·${NC} ${svc}: not deployed"
+    else
+      echo -e "  ${RED}✗${NC} ${svc}: ${status}"
+    fi
+  done
+}
+
+_report_heavy() {
+  for entry in "$@"; do
+    local svc container
+    IFS='|' read -r svc container <<< "$entry"
+    [[ -z "$container" ]] && container="dream-${svc}"
+
+    local status
+    if ! status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null); then  # stderr expected: container may not exist
+      status="not found"
+    fi
+    [[ "$status" == "not found" || "$status" == "exited" ]] && continue
+
+    local health
+    if ! health=$(docker inspect --format '{{.State.Health.Status}}' "$container" 2>/dev/null); then  # stderr expected: container may not expose healthcheck
+      health="none"
+    fi
+    if [[ "$health" == "healthy" ]]; then
+      echo -e "  ${GREEN}✓${NC} ${svc}: ready"
+    elif [[ "$status" == "running" ]]; then
+      echo -e "  ${CYAN}↓${NC} ${svc}: initializing in background"
+    elif [[ "$status" == "restarting" ]]; then
+      echo -e "  ${YELLOW}↻${NC} ${svc}: restarting (downloading models)"
+    fi
+  done
+}
+
+_report_normal() {
+  for entry in "$@"; do
+    local svc container
+    IFS='|' read -r svc container <<< "$entry"
+    [[ -z "$container" ]] && container="dream-${svc}"
+
+    local status
+    if ! status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null); then  # stderr expected: container may not exist
+      status="not found"
+    fi
+    [[ "$status" == "not found" || "$status" == "exited" ]] && continue
+
+    local health
+    if ! health=$(docker inspect --format '{{.State.Health.Status}}' "$container" 2>/dev/null); then  # stderr expected: container may not expose healthcheck
+      health="none"
+    fi
+    if [[ "$health" == "healthy" ]]; then
+      echo -e "  ${GREEN}✓${NC} ${svc}: healthy"
+    elif [[ "$status" == "running" ]]; then
+      echo -e "  ${YELLOW}◌${NC} ${svc}: starting up..."
+    fi
+  done
+}
+
+_report_background_downloads() {
+  if pgrep -f "aria2c.*gguf" > /dev/null 2>&1; then
+    echo -e "  ${CYAN}↓${NC} LLM model: upgrading in background (aria2c)"
+    echo "    Monitor: tail -f ${DS_DIR}/logs/aria2c-download.log"
+  fi
+  local bg_upgrade="${DS_DIR}/logs/model-upgrade.log"
+  if [[ -f "$bg_upgrade" ]] && pgrep -f "model-upgrade\|model.*download" > /dev/null 2>&1; then
+    echo -e "  ${CYAN}↓${NC} LLM model: upgrading in background (DreamServer)"
+  fi
+}
+
+_report_service_status
diff --git a/dream-server/installers/p2p-gpu/phases/10-voice-stack.sh b/dream-server/installers/p2p-gpu/phases/10-voice-stack.sh
new file mode 100644
index 000000000..7c8f82a92
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/10-voice-stack.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 10: Voice Stack
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Bootstrap Whisper ASR model + Kokoro TTS readiness gate
+#
+# Expects: DS_DIR, log(), ensure_whisper_asr_model(), ensure_tts_model_ready()
+# Provides: Voice services (STT/TTS) initialized with models
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 10/12: Verifying TTS/STT model availability"
+
+ensure_whisper_asr_model "$DS_DIR"
+ensure_tts_model_ready "$DS_DIR"
+
+_check_open_webui_health() {
+	local env_file="${DS_DIR}/.env"
+	local webui_port
+	webui_port="$(env_get "$env_file" "WEBUI_PORT")"
+	webui_port="${webui_port:-3000}"
+
+	if docker ps --format '{{.Names}}' | grep -qx 'dream-webui'; then
+		if ! wait_for_http "http://127.0.0.1:${webui_port}/health" 60 4; then
+			warn "Open WebUI not healthy yet — STT requests may return server connection errors"
+		fi
+	fi
+}
+
+_check_open_webui_health
diff --git a/dream-server/installers/p2p-gpu/phases/11-access-layer.sh b/dream-server/installers/p2p-gpu/phases/11-access-layer.sh
new file mode 100644
index 000000000..29dfffa3c
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/11-access-layer.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 11: Access Layer
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Cloudflare tunnel, SSH tunnel scripts, and access guidance
+#
+# Expects: DS_DIR, GPU_BACKEND, log(), warn(), setup_cloudflare_tunnel(),
+#          generate_ssh_tunnel_script(),
+#          generate_powershell_tunnel_script(),
+#          comfyui_preload_models()
+# Provides: All access methods configured for Vast.ai connectivity
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 11/12: Setting up access layer"
+
+# ComfyUI extra model downloads (if configured)
+comfyui_preload_models "$DS_DIR" "$GPU_BACKEND"
+
+# Prefer SSH tunnel mode for Vast.ai reliability and Windows compatibility.
+log "Using SSH tunnel mode for access (no public reverse-proxy URLs shown)"
+
+# Optional Cloudflare Tunnel
+setup_cloudflare_tunnel "$DS_DIR"
+
+# Auto-reconnecting SSH tunnel script
+generate_ssh_tunnel_script "$DS_DIR"
+generate_powershell_tunnel_script "$DS_DIR"
diff --git a/dream-server/installers/p2p-gpu/phases/12-summary.sh b/dream-server/installers/p2p-gpu/phases/12-summary.sh
new file mode 100644
index 000000000..5e95bf845
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/12-summary.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 12: Summary
+# ============================================================================
+# Part of: p2p-gpu/phases/
+# Purpose: Print access info, connection methods, final success message
+#
+# Expects: DS_DIR, LOGFILE, log(), print_access_info(), _ts()
+# Provides: User-facing summary of all access methods
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 12/12: Setup complete"
+
+print_access_info "$DS_DIR"
+
+# [NON-FATAL: logging] Summary logging should not block completion.
+echo "=== Setup completed at $(_ts) ===" >> "$LOGFILE" || warn "logfile write failed (non-fatal)"
+log "Setup complete! Core services ready. Heavy services downloading in background."
diff --git a/dream-server/installers/p2p-gpu/setup.sh b/dream-server/installers/p2p-gpu/setup.sh
new file mode 100755
index 000000000..56d16dd77
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/setup.sh
@@ -0,0 +1,193 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Deploy Orchestrator
+# ============================================================================
+# Deploy DreamServer on peer-to-peer GPU marketplaces (Vast.ai)
+#
+# Target:  Remote GPU instance (NVIDIA, AMD, or CPU-only)
+# OS:      Ubuntu 22.04 / 24.04
+# License: Apache-2.0 (same as DreamServer)
+#
+# Usage:
+#   bash setup.sh              # Full install
+#   bash setup.sh --resume     # Quick restart (re-apply fixes + start)
+#   bash setup.sh --status     # Health check
+#   bash setup.sh --info       # Show connection URLs
+#   bash setup.sh --fix        # Apply fixes + restart (no reinstall)
+#   bash setup.sh --teardown   # Stop all services
+#
+# This file sources library modules (pure functions) then runs each install
+# phase in order. Modules live under:
+#   lib/           — reusable function libraries
+#   phases/        — sequential install steps (execute on source)
+#   subcommands/   — alternative entry points (--teardown, --status, etc.)
+#
+# Design: adapted from DreamServer CLAUDE.md for provider environments
+#   Let It Crash > KISS > Pure Functions > SOLID
+#   set -euo pipefail everywhere. Non-fatal paths use || warn (per
+#   CLAUDE.md §4) because on rented hardware, partial stack > dead stack.
+# ============================================================================
+
+set -euo pipefail
+IFS=$'\n\t'
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "$0")"
+DRY_RUN=false
+
+# ── Source libraries ────────────────────────────────────────────────────────
+source "${SCRIPT_DIR}/lib/constants.sh"
+source "${SCRIPT_DIR}/lib/logging.sh"
+source "${SCRIPT_DIR}/lib/environment.sh"
+source "${SCRIPT_DIR}/lib/permissions.sh"
+source "${SCRIPT_DIR}/lib/services.sh"
+source "${SCRIPT_DIR}/lib/networking.sh"
+source "${SCRIPT_DIR}/lib/models.sh"
+source "${SCRIPT_DIR}/lib/gpu-topology.sh"
+source "${SCRIPT_DIR}/lib/compatibility.sh"
+
+# ── Source subcommands ──────────────────────────────────────────────────────
+source "${SCRIPT_DIR}/subcommands/teardown.sh"
+source "${SCRIPT_DIR}/subcommands/status.sh"
+source "${SCRIPT_DIR}/subcommands/resume.sh"
+source "${SCRIPT_DIR}/subcommands/fix.sh"
+source "${SCRIPT_DIR}/subcommands/info.sh"
+
+# ── Subcommand routing ─────────────────────────────────────────────────────
+_route_subcommand() {
+  case "${1:-}" in
+    --teardown|teardown)  cmd_teardown; exit 0 ;;
+    --status|status)      cmd_status;   exit 0 ;;
+    --resume|resume)      cmd_resume;   exit 0 ;;
+    --fix|fix)            cmd_fix;      exit 0 ;;
+    --info|info)          cmd_info;     exit 0 ;;
+    --dry-run)            DRY_RUN=true ;;
+    --version)            echo "dreamserver-vastai-setup v${VASTAI_VERSION}"; exit 0 ;;
+    --help|-h)            _print_help; exit 0 ;;
+    --*)                  err "Unknown option: ${1}"; echo "Run 'bash ${SCRIPT_NAME} --help'"; exit 1 ;;
+  esac
+}
+
+_print_help() {
+  echo ""
+  echo -e "${BOLD}DreamServer — Vast.ai Setup v${VASTAI_VERSION}${NC}"
+  echo ""
+  echo -e "${BOLD}Usage:${NC} bash ${SCRIPT_NAME} [COMMAND]"
+  echo ""
+  echo -e "${BOLD}Commands:${NC}"
+  echo "  (no args)     Full install (first time) or re-install"
+  echo "  --resume      Quick restart — re-apply fixes and start services"
+  echo "  --status      Health check — show GPU, containers, ports"
+  echo "  --info        Show connection URLs and SSH tunnel commands"
+  echo "  --fix         Apply latest fixes without full re-install"
+  echo "  --teardown    Stop all services"
+  echo "  --dry-run     Preview what would happen without making changes"
+  echo "  --help        Show this help"
+  echo ""
+  echo -e "${BOLD}Common scenarios:${NC}"
+  echo "  First time:         bash ${SCRIPT_NAME}"
+  echo "  SSH dropped:        bash ${SCRIPT_NAME} --resume"
+  echo "  Services broken:    bash ${SCRIPT_NAME} --fix"
+  echo "  Check status:       bash ${SCRIPT_NAME} --status"
+  echo "  Done for the day:   bash ${SCRIPT_NAME} --teardown"
+  echo ""
+}
+
+# ── Smart re-run detection ──────────────────────────────────────────────────
+_check_existing_install() {
+  local existing_dir
+  existing_dir=$(find_dream_dir 2>&1 || echo "")
+  if [[ -n "$existing_dir" && -f "${existing_dir}/.env" ]]; then
+    local running_count
+    running_count=$(docker ps --format '{{.Names}}' 2>&1 | grep -c '^dream-' || echo 0)
+    if [[ "$running_count" -gt 0 ]]; then
+      echo ""
+      echo -e "${YELLOW}${BOLD}  DreamServer is already installed (${running_count} services running).${NC}"
+      echo ""
+      echo -e "  You probably want:"
+      echo -e "    ${BOLD}bash ${SCRIPT_NAME} --resume${NC}   → Quick restart + fixes"
+      echo -e "    ${BOLD}bash ${SCRIPT_NAME} --fix${NC}      → Apply fixes only"
+      echo -e "    ${BOLD}bash ${SCRIPT_NAME} --status${NC}   → Check health"
+      echo ""
+      echo -n -e "  Continue with full re-install? [y/N] "
+      local answer
+      read -r -t 15 answer || answer="n"
+      if [[ "${answer,,}" != "y" && "${answer,,}" != "yes" ]]; then
+        log "Aborted. Use --resume, --fix, --status, or --info."
+        exit 0
+      fi
+      echo ""
+    fi
+  fi
+}
+
+# ── Main install flow ──────────────────────────────────────────────────────
+main() {
+  _route_subcommand "${1:-}"
+
+  # ── Full install ──────────────────────────────────────────────────────
+  echo ""
+  echo -e "${CYAN}${BOLD}  DreamServer — Vast.ai Setup v${VASTAI_VERSION}${NC}"
+  echo -e "${DIM}  https://github.com/Light-Heart-Labs/DreamServer${NC}"
+  echo ""
+
+  setup_cleanup_trap
+  acquire_lock
+  mkdir -p "$(dirname "$LOGFILE")"
+  # [NON-FATAL: logging] Setup can proceed even if the logfile is unwritable.
+  echo "=== Setup started at $(_ts) ===" >> "$LOGFILE" || warn "logfile write failed (non-fatal)"
+
+  _check_existing_install
+
+  # ── Dry-run mode: preview without executing ────────────────────────
+  if [[ "$DRY_RUN" == "true" ]]; then
+    echo ""
+    echo -e "${BOLD}Dry-run mode — no changes will be made.${NC}"
+    echo ""
+    echo "This setup would:"
+    echo "  1.  Detect GPU and validate system requirements"
+    echo "  2.  Install dependencies (sudo, git, curl, jq, aria2, etc.)"
+    echo "  3.  Create 'dream' user with Docker access"
+    echo "  4.  Clone DreamServer from ${REPO_URL:-Light-Heart-Labs/DreamServer}"
+    echo "  5.  Run DreamServer installer (non-interactive, 600s timeout)"
+    echo "  6.  Apply post-install fixes (permissions, env defaults)"
+    echo "  7.  Download/verify GGUF model for llama-server"
+    echo "  8.  Apply Vast.ai-specific quirks (/dev/shm, no-systemd)"
+    echo "  9.  Start Docker Compose services + health check"
+    echo "  10. Bootstrap voice stack (Whisper + Kokoro TTS)"
+    echo "  11. Set up reverse proxy (Caddy) + access tunnels"
+    echo "  12. Print connection info and SSH tunnel commands"
+    echo ""
+    echo -e "${BOLD}System:${NC}"
+    detect_gpu
+    echo "  GPU:    ${GPU_NAME} (${GPU_BACKEND}, ${GPU_VRAM} MB VRAM)"
+    echo "  CPU:    $(nproc) cores"
+    echo "  Disk:   $(df -BG --output=avail . 2>>"$LOGFILE" | tail -1 | tr -dc '0-9')GB available"
+    echo "  Docker: $(docker --version 2>>"$LOGFILE" || echo 'not installed')"
+    echo ""
+    echo "Run without --dry-run to proceed."
+    exit 0
+  fi
+
+  # Shared state variables (set by phases, used across phases)
+  GPU_BACKEND="" GPU_NAME="" GPU_VRAM="" GPU_COUNT=0
+  CPU_COUNT=0 DISK_AVAIL_GB=0 COMPOSE_CMD=""
+  REPO_DIR="" DS_DIR=""
+
+  # ── Execute phases in order ───────────────────────────────────────────
+  source "${SCRIPT_DIR}/phases/00-preflight.sh"
+  source "${SCRIPT_DIR}/phases/01-dependencies.sh"
+  source "${SCRIPT_DIR}/phases/02-user-setup.sh"
+  source "${SCRIPT_DIR}/phases/03-repository.sh"
+  source "${SCRIPT_DIR}/phases/04-installer.sh"
+  source "${SCRIPT_DIR}/phases/05-post-install.sh"
+  source "${SCRIPT_DIR}/phases/06-bootstrap-model.sh"
+  source "${SCRIPT_DIR}/phases/07-model-optimize.sh"
+  source "${SCRIPT_DIR}/phases/08-vastai-quirks.sh"
+  source "${SCRIPT_DIR}/phases/09-services.sh"
+  source "${SCRIPT_DIR}/phases/10-voice-stack.sh"
+  source "${SCRIPT_DIR}/phases/11-access-layer.sh"
+  source "${SCRIPT_DIR}/phases/12-summary.sh"
+}
+
+main "$@"
diff --git a/dream-server/installers/p2p-gpu/subcommands/fix.sh b/dream-server/installers/p2p-gpu/subcommands/fix.sh
new file mode 100644
index 000000000..a522eee56
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/subcommands/fix.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Subcommand: fix
+# ============================================================================
+# Part of: p2p-gpu/subcommands/
+# Purpose: Apply fixes without full reinstall (port rebind, network fix,
+#          CPU cap, permissions, service restart)
+#
+# Expects: log(), warn(), err(), find_dream_dir(), detect_gpu_backend(),
+#          expose_ports_for_vastai(), apply_post_install_fixes(),
+#          start_services(), ensure_whisper_asr_model(), ensure_tts_model_ready(),
+#          generate_ssh_tunnel_script(),
+#          generate_powershell_tunnel_script(), print_access_info(),
+#          get_compose_cmd()
+# Provides: All runtime fixes applied and services restarted
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+  cmd_fix() {
+  step "Applying fixes (no reinstall)"
+  local ds_dir
+  ds_dir=$(find_dream_dir) || { err "DreamServer directory not found. Run full install first."; exit 1; }
+
+  cd "$ds_dir" || exit 1
+  detect_gpu
+  local gpu_backend="$GPU_BACKEND"
+
+  expose_ports_for_vastai "$ds_dir"
+
+  # Fix stale Docker network
+  if docker network inspect dream-network >/dev/null 2>&1; then
+    local net_label
+    net_label=$(docker network inspect dream-network \
+      --format '{{index .Labels "com.docker.compose.network"}}' 2>&1 || echo "")
+    if [[ -z "$net_label" ]]; then
+      log "Fixing stale dream-network..."
+      local compose_cmd
+      compose_cmd=$(get_compose_cmd)
+      if [[ "$compose_cmd" == "docker compose" ]]; then
+        # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
+        docker compose down 2>&1 || warn "compose down failed (non-fatal)"
+      else
+        # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
+        docker-compose down 2>&1 || warn "compose down failed (non-fatal)"
+      fi
+      for cid in $(docker network inspect dream-network \
+        -f '{{range .Containers}}{{.Name}} {{end}}' 2>&1 || echo ""); do
+        # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
+        docker network disconnect -f dream-network "$cid" || warn "disconnect ${cid} failed (non-fatal)"
+      done
+      # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
+      docker network rm dream-network || warn "network rm failed (non-fatal)"
+      log "Stale network removed — compose will recreate on next start"
+    fi
+  fi
+
+  apply_post_install_fixes "$ds_dir" "$gpu_backend"
+  if [[ "${GPU_COUNT:-0}" -ge "${MULTIGPU_MIN_GPUS:-2}" ]]; then
+    enumerate_gpus
+    run_gpu_assignment "$ds_dir" "${ds_dir}/.env"
+  fi
+
+  log "Fixes applied. Restarting services..."
+  start_services "$ds_dir"
+  ensure_whisper_asr_model "$ds_dir"
+  ensure_tts_model_ready "$ds_dir"
+
+  generate_ssh_tunnel_script "$ds_dir"
+  generate_powershell_tunnel_script "$ds_dir"
+
+  print_access_info "$ds_dir"
+  log "Fix complete!"
+}
diff --git a/dream-server/installers/p2p-gpu/subcommands/info.sh b/dream-server/installers/p2p-gpu/subcommands/info.sh
new file mode 100644
index 000000000..e15f00d50
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/subcommands/info.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Subcommand: info
+# ============================================================================
+# Part of: p2p-gpu/subcommands/
+# Purpose: Print connection details only (no modifications)
+#
+# Expects: err(), find_dream_dir(), print_access_info()
+# Provides: Display of all access methods and URLs
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+cmd_info() {
+  local ds_dir
+  ds_dir=$(find_dream_dir) || { err "DreamServer directory not found. Run full install first."; exit 1; }
+  print_access_info "$ds_dir"
+}
diff --git a/dream-server/installers/p2p-gpu/subcommands/resume.sh b/dream-server/installers/p2p-gpu/subcommands/resume.sh
new file mode 100644
index 000000000..d34573154
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/subcommands/resume.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Subcommand: resume
+# ============================================================================
+# Part of: p2p-gpu/subcommands/
+# Purpose: Quick restart — re-apply fixes and start services
+#
+# Expects: log(), warn(), err(), find_dream_dir(), detect_gpu_backend(),
+#          apply_post_install_fixes(), start_services(),
+#          ensure_whisper_asr_model(), ensure_tts_model_ready(),
+#          generate_ssh_tunnel_script(), generate_powershell_tunnel_script(),
+#          print_access_info()
+# Provides: Running DreamServer with latest fixes applied
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+cmd_resume() {
+  step "Resuming DreamServer"
+  local ds_dir
+  ds_dir=$(find_dream_dir) || { err "DreamServer directory not found"; exit 1; }
+
+  cd "$ds_dir" || exit 1
+  detect_gpu
+  local gpu_backend="$GPU_BACKEND"
+
+  apply_post_install_fixes "$ds_dir" "$gpu_backend"
+  if [[ "${GPU_COUNT:-0}" -ge "${MULTIGPU_MIN_GPUS:-2}" ]]; then
+    enumerate_gpus
+    run_gpu_assignment "$ds_dir" "${ds_dir}/.env"
+  fi
+  start_services "$ds_dir"
+  print_access_info "$ds_dir"
+
+  # Keep the remaining resume steps after the access summary so a later
+  # optional failure does not hide the URLs and commands from the terminal.
+  ensure_whisper_asr_model "$ds_dir"
+  ensure_tts_model_ready "$ds_dir"
+  generate_ssh_tunnel_script "$ds_dir"
+  generate_powershell_tunnel_script "$ds_dir"
+}
diff --git a/dream-server/installers/p2p-gpu/subcommands/status.sh b/dream-server/installers/p2p-gpu/subcommands/status.sh
new file mode 100644
index 000000000..7a808328b
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/subcommands/status.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Subcommand: status
+# ============================================================================
+# Part of: p2p-gpu/subcommands/
+# Purpose: Display GPU info, container status, download progress
+#
+# Expects: log(), warn(), err(), find_dream_dir()
+# Provides: Health status overview
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+cmd_status() {
+  local ds_dir
+  ds_dir=$(find_dream_dir) || { err "DreamServer directory not found"; exit 1; }
+
+  echo -e "\n${BOLD}DreamServer Status${NC}\n"
+
+  # GPU info
+  local gpu_backend
+  gpu_backend=$(detect_gpu_backend)
+  case "$gpu_backend" in
+    nvidia)
+      if nvidia-smi --query-gpu=name,memory.total,memory.used,utilization.gpu \
+        --format=csv,noheader 2>>"$LOGFILE" | while IFS=',' read -r name mem_total mem_used util; do
+        echo -e "  GPU: ${CYAN}${name}${NC} | VRAM: ${mem_used} /${mem_total} | Util: ${util}"
+      done; then
+        :
+      else
+        warn "NVIDIA backend detected but nvidia-smi query failed"
+      fi
+      ;;
+    amd)
+      if command -v rocm-smi >/dev/null 2>&1; then
+        local amd_name amd_vram
+        amd_name=$(rocm-smi --showproductname 2>>"$LOGFILE" | grep -oP 'Card series:\s*\K.*' | head -1 || echo "AMD GPU")
+        amd_vram=$(rocm-smi --showmeminfo vram 2>>"$LOGFILE" | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' | head -1 || echo "0")
+        if [[ "${amd_vram:-0}" -gt 1000000 ]]; then
+          amd_vram=$(( amd_vram / 1048576 ))
+        fi
+        echo -e "  GPU: ${CYAN}${amd_name}${NC} | VRAM: ${amd_vram} MiB"
+      else
+        warn "AMD backend detected but rocm-smi is not available"
+      fi
+      ;;
+    *)
+      echo "  GPU: CPU-only mode (no accelerator detected)"
+      ;;
+  esac
+
+  echo ""
+  docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" 2>&1 | head -20
+
+  echo ""
+  local healthy running total
+  healthy=$(docker ps --filter "health=healthy" --format '{{.Names}}' | wc -l)
+  running=$(docker ps --format '{{.Names}}' | wc -l)
+  total=$(docker ps -a --format '{{.Names}}' | grep -c '^dream-' || echo 0)
+  echo -e "  Containers: ${GREEN}${healthy}${NC} healthy / ${running} running / ${total} total"
+
+  if pgrep -f "aria2c.*gguf" > /dev/null 2>&1; then
+    echo -e "  Model download: ${YELLOW}in progress${NC}"
+    local dl_log="${ds_dir}/logs/aria2c-download.log"
+    [[ -f "$dl_log" ]] && tail -1 "$dl_log" 2>&1 | sed 's/^/    /'
+  fi
+  echo ""
+}
diff --git a/dream-server/installers/p2p-gpu/subcommands/teardown.sh b/dream-server/installers/p2p-gpu/subcommands/teardown.sh
new file mode 100644
index 000000000..688ca0e99
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/subcommands/teardown.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Subcommand: teardown
+# ============================================================================
+# Part of: dream-server/installers/p2p-gpu/subcommands/
+# Purpose: Stop all containers and background processes to stop all services
+#
+# Expects: log(), warn(), err(), find_dream_dir(), get_compose_cmd(),
+#          _kill_stored_pid(), PIDFILE_DIR, SCRIPT_NAME
+# Provides: Clean shutdown of all DreamServer services
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+cmd_teardown() {
+  step "Teardown — stopping all services"
+  local ds_dir
+  ds_dir=$(find_dream_dir) || { err "DreamServer directory not found"; exit 1; }
+
+  cd "$ds_dir" || exit 1
+
+  if [[ -f "docker-compose.base.yml" ]]; then
+    local compose_cmd
+    compose_cmd=$(get_compose_cmd)
+    if [[ "$compose_cmd" == "docker compose" ]]; then
+      # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
+      docker compose down --remove-orphans 2>&1 || warn "Compose down had warnings (non-fatal)"
+    else
+      # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
+      docker-compose down --remove-orphans 2>&1 || warn "Compose down had warnings (non-fatal)"
+    fi
+  fi
+
+  # [FIX: pkill] Use PID-file based cleanup instead of pkill -f
+  _kill_stored_pid "aria2c-model"
+  _kill_stored_pid "model-swap-watcher"
+  _kill_stored_pid "cloudflared"
+
+  log "All services stopped. Storage billing continues."
+  log "To fully stop billing: delete the instance from the provider console."
+  echo ""
+  echo -e "${BOLD}Data preserved at:${NC} ${ds_dir}/data/"
+  echo -e "${BOLD}To resume:${NC} bash ${SCRIPT_NAME} --resume"
+}
diff --git a/dream-server/installers/p2p-gpu/tests/test-nvml-mismatch.sh b/dream-server/installers/p2p-gpu/tests/test-nvml-mismatch.sh
new file mode 100644
index 000000000..ae1e48b64
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/tests/test-nvml-mismatch.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+# Regression: ensure NVML mismatch repair path is reachable under set -e.
+set -euo pipefail
+
+P2P_GPU_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+LOGFILE="$(mktemp -t p2p-gpu-nvml.XXXXXX)"
+STUB_DIR="$(mktemp -d -t p2p-gpu-stub.XXXXXX)"
+APT_CALLED_FILE="${STUB_DIR}/apt-called"
+trap 'rm -f "$LOGFILE"; rm -rf "$STUB_DIR"' EXIT
+
+# Minimal logging functions expected by environment.sh
+log() { :; }
+warn() { :; }
+err() { :; }
+step() { :; }
+
+assert_no_apt_call() {
+  if [[ -e "$APT_CALLED_FILE" ]]; then
+    echo "Expected repair path to skip apt-get" >&2
+    exit 1
+  fi
+}
+
+# shellcheck source=../lib/environment.sh
+source "${P2P_GPU_DIR}/lib/environment.sh"
+
+# Force mismatch status to validate repair path.
+detect_nvml_mismatch() {
+  return 1
+}
+
+export PATH="${STUB_DIR}:${PATH}"
+export APT_CALLED_FILE
+
+cat >"${STUB_DIR}/apt-get" <<'EOF'
+#!/usr/bin/env bash
+set -euo pipefail
+echo "called" >> "${APT_CALLED_FILE}"
+exit 0
+EOF
+
+cat >"${STUB_DIR}/systemctl" <<'EOF'
+#!/usr/bin/env bash
+set -euo pipefail
+exit 0
+EOF
+
+cat >"${STUB_DIR}/service" <<'EOF'
+#!/usr/bin/env bash
+set -euo pipefail
+exit 0
+EOF
+
+chmod +x "${STUB_DIR}/apt-get" "${STUB_DIR}/systemctl" "${STUB_DIR}/service"
+
+sleep() { :; }
+
+if repair_nvml_mismatch; then
+  repair_status=0
+else
+  repair_status=$?
+fi
+
+if [[ "$repair_status" -ne 1 ]]; then
+  echo "Expected repair_nvml_mismatch to return 1 when mismatch persists" >&2
+  exit 1
+fi
+
+if [[ ! -s "$APT_CALLED_FILE" ]]; then
+  echo "Expected repair path to invoke apt-get for NVML mismatch" >&2
+  exit 1
+fi
+
+rm -f "$APT_CALLED_FILE"
+
+detect_nvml_mismatch() {
+  return 2
+}
+
+if repair_nvml_mismatch; then
+  repair_status=0
+else
+  repair_status=$?
+fi
+
+if [[ "$repair_status" -ne 1 ]]; then
+  echo "Expected repair_nvml_mismatch to return 1 when detection fails" >&2
+  exit 1
+fi
+
+assert_no_apt_call