name: P2P GPU checks

on:
  push:
    branches: [main]
    paths:
      - "dream-server/installers/p2p-gpu/**"
      - ".github/workflows/p2p-gpu.yml"
  pull_request:
    branches: [main]
    paths:
      - "dream-server/installers/p2p-gpu/**"
      - ".github/workflows/p2p-gpu.yml"

permissions:
  contents: read

jobs:
  p2p-gpu:
    name: P2P GPU syntax + regression
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6

      - name: Bash syntax check (p2p-gpu)
        run: |
          # `bash -n a.sh b.sh` only parses a.sh (b.sh becomes "$1"), so every
          # file must be checked with its own `bash -n` invocation. NUL-delimited
          # names keep paths with spaces intact.
          mapfile -d '' -t shfiles < <(
            find dream-server/installers/p2p-gpu -name '*.sh' -type f -print0
          )
          if [ "${#shfiles[@]}" -eq 0 ]; then
            echo "No .sh files found under dream-server/installers/p2p-gpu"
            exit 0
          fi
          status=0
          for f in "${shfiles[@]}"; do
            # Keep going after a failure so one run reports every broken file.
            bash -n "$f" || status=1
          done
          exit "$status"

      - name: NVML mismatch regression
        run: |
          # Live Vast.ai + GPU validation is performed manually outside CI.
          bash dream-server/installers/p2p-gpu/tests/test-nvml-mismatch.sh
Includes built-in recovery commands, health checks, and model auto-swap capabilities. + +## What It Solves + +**The Problem:** Deploying DreamServer on rented GPU instances is fragile. Root-only environments, non-standard filesystem permissions, held package locks, missing GPU drivers, and provider-specific quirks cause silent failures during setup. + +**The Solution:** `setup.sh` is a battle-tested orchestrator that detects and fixes the known issues automatically. It handles permission escalation, creates a non-root `dream` user, manages Docker group access, installs missing NVIDIA/AMD toolkits, applies POSIX ACLs for multi-container file sharing, and starts all bundled services (discovered from extension manifests) with health checks. If setup partially completes, recovery commands bring the stack back online without reinstall. + +## Quick Start + +```bash +# On your GPU instance (as root): +bash setup.sh # Full install (~10 min) +bash setup.sh --status # Health check +bash setup.sh --info # Show connection URLs and SSH tunnel commands +bash setup.sh --teardown # Stop all services +``` + +## Setup Guide + +- [Setup Tutorial_Video](https://drive.google.com/file/d/12CY9-KTyCsqRGtyaauqmvsupoh3jocBL/view?usp=sharing) +- [Setup presentation slides](https://docs.google.com/presentation/d/1XbVNV1n04JiOyAIkA6bU5r5A9T7uBnLr/edit?usp=sharing) + +## Quick Recovery (If Phase 9 Fails) + +If setup reached "Starting services" but URLs are unreachable: + +```bash +bash setup.sh --fix +bash setup.sh --status +bash setup.sh --info +``` + +This re-applies CPU caps, permissions, network fixes, restarts compose, and +prints fresh access commands. + +On Windows, use the all-port tunnel from `--info` (it uses a safe local alias +`58080 -> dashboard` plus direct localhost forwards for service ports). 
+ +`--fix` regenerates reconnect scripts: +- `connect-tunnel.sh` (Linux/macOS/WSL) +- `connect-tunnel.ps1` (Windows PowerShell) + +## What It Does + +The setup script handles 28 known issues with P2P GPU environments: + +| # | Issue | Fix | +|---|-------|-----| +| 01 | Root user rejection | Creates non-root `dream` user | +| 02 | Docker socket denied | Adds dream to docker group | +| 03 | /tmp broken | Fixes permissions to 1777 | +| 04 | CPU limit overflow | Auto-caps to actual core count | +| 05 | n8n uid mismatch | Dynamic UID from compose.yaml | +| 06 | dashboard-api write | ACL-based permission system | +| 07 | comfyui models write | AMD/NVIDIA layout detection | +| 08 | WEBUI_SECRET missing | Auto-generated secrets | +| 09 | Dual directory confusion | Smart directory discovery | +| 10 | Dashboard stuck Created | Auto-nudge on startup | +| 11 | HuggingFace throttle | aria2c multi-threaded download | +| 12 | NVIDIA toolkit missing | Auto-installs + configures | +| 13 | Disk space insufficient | Pre-flight validation | +| 14 | Compose v1 syntax | Auto-detects v1 vs v2 | +| 15 | .env duplicates | Idempotent env_set() | +| 16 | Port conflicts | Dynamic port discovery | +| 17 | DNS resolution failure | Google/Cloudflare DNS fallback | +| 18 | /dev/shm too small | Remount /dev/shm to 4GB | +| 19 | Bootstrap model missing | Auto-downloads Qwen3-0.6B | +| 20 | llama-server infinite hang | 45s diagnosis + OOM recovery | +| 21 | No systemd | Host-agent background start | +| 22 | OpenCode crash-loop | Auto-disable non-essential | +| 23 | CUDA OOM on large models | Swap to smallest model | +| 24 | ComfyUI infinite hang | Background download, don't block | +| 25 | Installer hang | 10min cap on the installer run | +| 26 | AMD GPU support | ROCm detection + compose overlay | +| 27 | CPU-only fallback | Works without any GPU | +| 28 | NVML driver/library mismatch | Detect + targeted repair (regression-tested) | + +## Architecture + +``` +p2p-gpu/ +├── setup.sh # Orchestrator — 
sources libs, runs phases +├── config/ +│ └── service-hints.yaml # p2p-gpu-only manifest overrides (proxy_mode, startup_behavior) +├── lib/ # Pure function libraries (no side effects) +│ ├── constants.sh # Paths, versions, colors, thresholds +│ ├── logging.sh # log/warn/err/step, cleanup trap, flock, dpkg-lock release +│ ├── environment.sh # .env management, GPU detection, HTTP polling +│ ├── permissions.sh # POSIX ACLs, setgid, UID-specific fixes +│ ├── services.sh # Manifest discovery, compose, startup +│ ├── networking.sh # Caddy proxy, SSH tunnel, Cloudflare +│ ├── models.sh # Model download, URL resolution, swap watcher +│ ├── gpu-topology.sh # Per-GPU enumeration, NVLink/PCIe topology, GPU↔service assignment +│ └── compatibility.sh # Whisper/TTS/ComfyUI/OpenClaw fixes +├── phases/ # Sequential install steps +│ ├── 00-preflight.sh # GPU/disk/Docker/DNS validation +│ ├── 01-dependencies.sh # System package installation +│ ├── 02-user-setup.sh # Create dream user + groups +│ ├── 03-repository.sh # Clone DreamServer repo +│ ├── 04-installer.sh # Run DreamServer installer (with timeout) +│ ├── 05-post-install.sh # Apply fixes, locate working directory +│ ├── 06-bootstrap-model.sh # Ensure usable GGUF model exists +│ ├── 07-model-optimize.sh # Resume/restart downloads with aria2c +│ ├── 08-vastai-quirks.sh # Provider-specific environment fixes +│ ├── 09-services.sh # Start containers + health monitoring +│ ├── 10-voice-stack.sh # TTS/STT model readiness gates +│ ├── 11-access-layer.sh # Caddy proxy + Cloudflare tunnel + SSH +│ └── 12-summary.sh # Print access info +├── subcommands/ # Alternative entry points +│ ├── teardown.sh # Stop all services +│ ├── status.sh # Health check dashboard +│ ├── resume.sh # Quick restart after SSH drop +│ ├── fix.sh # Apply fixes without reinstall +│ └── info.sh # Show connection URLs +└── tests/ + └── test-nvml-mismatch.sh # NVML mismatch repair-path regression (run in CI) +``` + +## Design Principles + +Aligned with DreamServer's 
[CLAUDE.md](../../../CLAUDE.md): + +- **Let It Crash** — `set -euo pipefail` throughout; errors are fatal unless a failure is explicitly tolerated with `|| warn`. Non-essential services degrade independently, so a working dashboard with a degraded ComfyUI beats a dead stack on an instance you're paying for. +- **KISS** — readable over clever; one function, one job. +- **Functional core, imperative shell** — `lib/` holds pure helpers; `phases/` is the imperative shell that runs on source. +- **Manifest-driven** — services are discovered from extension manifests, never a hardcoded list. +- **PID-file process tracking** — background jobs (model downloads, swap watcher, tunnels) are tracked by PID file under `/var/run/dreamserver-p2p-gpu/` and stopped by PID. +- **ACL-primary permissions** — shared-data directories use setgid + POSIX ACLs as their only sharing mechanism. Failures on those paths abort the install (`exit 1`) rather than degrading to world-writable permissions; per-extension ACLs are applied independently so one extension's failure doesn't block the rest. + +## Commands + +| Command | Purpose | +|---------|---------| +| `bash setup.sh` | Full install (first time or re-install) | +| `bash setup.sh --resume` | Quick restart — re-apply fixes + start services | +| `bash setup.sh --status` | Health check — GPU, containers, ports | +| `bash setup.sh --info` | Show connection URLs and SSH tunnel commands | +| `bash setup.sh --fix` | Apply latest fixes without full reinstall | +| `bash setup.sh --teardown` | Stop all services | +| `bash setup.sh --dry-run` | Preview what would happen without making changes | + +## Model Download and Auto-Swap + +- Setup starts quickly on a small model, downloads the GPU-tier model in background, then auto-swaps when ready. +- Swap updates both `GGUF_FILE` and `LLM_MODEL`, then restarts dependent services. +- Dashboard model downloads (`/models` page) require the Dream host agent; setup auto-starts it during service startup. 
+ +```bash +MODEL="Qwen3-30B-A3B-Q4_K_M.gguf"; DS_DIR="${DS_DIR:-/home/dream/dream-server}"; LLM_MODEL="$(echo "$MODEL" | sed -E 's/\.(gguf|GGUF)$//' | sed -E 's/-Q[0-9]+([._][A-Za-z0-9]+)*$//' | tr '[:upper:]' '[:lower:]')"; cd "$DS_DIR" && sed -i "s|^GGUF_FILE=.*|GGUF_FILE=${MODEL}|" .env && { grep -q '^LLM_MODEL=' .env && sed -i "s|^LLM_MODEL=.*|LLM_MODEL=${LLM_MODEL}|" .env || echo "LLM_MODEL=${LLM_MODEL}" >> .env; } && docker compose $(cat .compose-flags 2>/dev/null) up -d llama-server && for c in dream-dreamforge dream-openclaw dream-dashboard-api dream-webui; do docker ps --format '{{.Names}}' | grep -qx "$c" && docker restart "$c" >/dev/null || echo "[warn] ${c} restart failed (non-fatal)" >&2; done +``` + +```bash +tail -f /home/dream/dream-server/logs/aria2c-download.log +``` + +```bash +# If Dashboard shows "Failed to start download" +su - dream -c 'cd /home/dream/dream-server && DREAM_HOME=/home/dream/dream-server ./dream-cli agent start' +``` + +## Provider Support + +Currently tested on **Vast.ai**. The architecture is provider-agnostic: +- GPU detection works for any NVIDIA/AMD/CPU-only instance +- Docker + compose requirements are standard +- Provider-specific quirks isolated in `phases/08-vastai-quirks.sh` + +The active provider is selected by `PROVIDER_NAME` (override with `P2P_GPU_PROVIDER` +before running). To add a new provider, create `phases/08-<provider>-quirks.sh` with +provider-specific fixes. 
+ +## Security + +- `.env` files created with `0660` mode, owned `dream:dream` — readable by the `dream` group the containers run under, never world-readable +- SSH private keys forced to `0600` +- Background process PIDs tracked in `/var/run/dreamserver-p2p-gpu/` +- Cloudflare tokens passed via environment variables (not CLI args) +- `cloudflared` binary verified against the upstream SHA256 when the checksum file is reachable; on mismatch the tunnel is skipped +- POSIX ACLs required; world-writable permissions are never used +- Multi-UID directories documented with reasons for broader access + +## Related + +- [`../../../README.md`](../../../README.md) — DreamServer project overview +- [`../../../CLAUDE.md`](../../../CLAUDE.md) — design philosophy and error-handling rules +- [`../../docs/INSTALLER-ARCHITECTURE.md`](../../docs/INSTALLER-ARCHITECTURE.md) — installer module map and header convention +- [`../../docs/EXTENSIONS.md`](../../docs/EXTENSIONS.md) — service/extension manifest model +- [`../../CONTRIBUTING.md`](../../CONTRIBUTING.md) — contribution and validation guide +- [`../../../SECURITY.md`](../../../SECURITY.md) — security policy and disclosure \ No newline at end of file diff --git a/dream-server/installers/p2p-gpu/config/service-hints.yaml b/dream-server/installers/p2p-gpu/config/service-hints.yaml new file mode 100644 index 000000000..c68b3c49f --- /dev/null +++ b/dream-server/installers/p2p-gpu/config/service-hints.yaml @@ -0,0 +1,23 @@ +# P2P GPU deployment hints — service-specific overrides for the setup script. +# These supplement manifest.yaml defaults ONLY within the p2p-gpu context. +# When upstream adopts proxy_mode/startup_behavior as first-class manifest +# fields, delete this file and remove the hints merge in lib/services.sh. 
#!/usr/bin/env bash
# ============================================================================
# Dream Server — Vast.ai Compatibility Fixes
# ============================================================================
# Part of: p2p-gpu/lib/
# Purpose: Service-specific compatibility patches for Whisper, TTS, ComfyUI,
#          and OpenClaw running on Vast.ai instances
#
# Expects: LOGFILE, log(), warn(), env_get(), wait_for_http(),
#          apply_data_acl()
# Provides: ensure_whisper_ui_compatibility(), ensure_webui_stt_model_alignment(),
#           map_whisper_model_id(), ensure_whisper_asr_model(), ensure_tts_model_ready(),
#           fix_comfyui_permissions(), comfyui_preload_models(),
#           patch_openclaw_inject_token_runtime()
#
# Modder notes:
#   These are narrow fixes for known Vast.ai failure modes. Each function
#   is idempotent and safe to re-run.
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# Strip leading and trailing whitespace from $1 and print the result.
# Pure parameter expansion: unlike `xargs`, quotes and backslashes in the
# value are left untouched.
_trim_whitespace() {
  local s="${1:-}"
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s' "$s"
}

# Fix Whisper UI internal API connectivity + entrypoint executable bit.
# Arguments: $1 - DreamServer working directory
# Returns:   0 when files are missing or already patched; sed failures abort
#            under `set -e`.
ensure_whisper_ui_compatibility() {
  local ds_dir="$1"
  local whisper_dir="${ds_dir}/extensions/services/whisper"
  local whisper_compose="${whisper_dir}/compose.yaml"
  local whisper_entrypoint="${whisper_dir}/docker-entrypoint.sh"

  if [[ -f "$whisper_entrypoint" ]]; then
    # [NON-FATAL: whisper] Entry point permissions only affect Whisper UI.
    chmod 755 "$whisper_entrypoint" || warn "whisper entrypoint chmod failed (non-fatal)"
  fi

  [[ -f "$whisper_compose" ]] || return 0

  # Already injected — keep idempotent.
  if grep -q 'LOOPBACK_HOST_URL=' "$whisper_compose"; then
    return 0
  fi
  if ! grep -q 'WHISPER__TTL=' "$whisper_compose"; then
    warn "Whisper compose env block not found — skipped loopback injection"
    return 0
  fi

  # Append the new entries right after the WHISPER__TTL list item, reusing that
  # item's own "<indent>- " prefix (\1) so the YAML nesting stays valid whatever
  # indentation the compose file uses.
  local item_re='^([[:space:]]*-[[:space:]]+).*WHISPER__TTL=.*$'
  local additions='\1LOOPBACK_HOST_URL=http://127.0.0.1:8000'
  additions+='\n\1CHAT_COMPLETION_BASE_URL=http://llama-server:8080/v1'
  additions+='\n\1CHAT_COMPLETION_API_KEY=cant-be-empty'
  sed -i -E "s#${item_re}#&\\n${additions}#" "$whisper_compose"

  # Only claim success if the entries really landed in the file.
  if grep -q 'LOOPBACK_HOST_URL=' "$whisper_compose"; then
    log "Injected Whisper UI loopback compatibility env"
  else
    warn "Whisper compose env block not found — skipped loopback injection"
  fi
}

# Keep Open WebUI STT model aligned with the Whisper model we bootstrap.
# Fixes mismatch where WebUI requests a model that Whisper does not have.
# Arguments: $1 - DreamServer working directory
# Returns:   0 when the overlay or the AUDIO_STT_MODEL key is absent, or when
#            the value already matches.
ensure_webui_stt_model_alignment() {
  local ds_dir="$1"
  local env_file="${ds_dir}/.env"
  local nvidia_overlay="${ds_dir}/docker-compose.nvidia.yml"
  [[ -f "$nvidia_overlay" ]] || return 0

  local whisper_cfg model_id current
  whisper_cfg="$(env_get "$env_file" "WHISPER_MODEL")"
  model_id="$(map_whisper_model_id "$whisper_cfg")"
  if [[ -z "$model_id" ]]; then
    model_id="Systran/faster-whisper-base"
  fi

  # Nothing to align (and nothing to log) when the key is not defined.
  if ! grep -q -E '^[[:space:]]*AUDIO_STT_MODEL:' "$nvidia_overlay"; then
    return 0
  fi

  # First AUDIO_STT_MODEL value, without trailing blanks or surrounding quotes,
  # so a quoted value compares equal to the bare model id.
  current=$(sed -n -E \
    '/^[[:space:]]*AUDIO_STT_MODEL:/{s/^[[:space:]]*AUDIO_STT_MODEL:[[:space:]]*//;p;q;}' \
    "$nvidia_overlay")
  current="${current%"${current##*[![:space:]]}"}"
  current=${current#[\"\']}
  current=${current%[\"\']}
  if [[ "$current" == "$model_id" ]]; then
    return 0
  fi

  # Escape sed replacement metacharacters; custom "org/model" ids are passed
  # through verbatim by map_whisper_model_id.
  local escaped
  escaped=$(printf '%s' "$model_id" | sed -e 's/[\\&|]/\\&/g')

  # Preserve existing indentation to avoid corrupting YAML structure.
  sed -i -E "s|^([[:space:]]*)AUDIO_STT_MODEL:.*|\\1AUDIO_STT_MODEL: \"${escaped}\"|" \
    "$nvidia_overlay"
  log "Aligned Open WebUI STT model to ${model_id}"
}

# Map friendly WHISPER_MODEL values to Speaches-compatible model IDs.
# Arguments: $1 - configured value (optional; empty/unset maps to "base")
# Outputs:   model id on stdout; values containing "/" pass through unchanged,
#            unknown names fall back to the base model.
map_whisper_model_id() {
  local raw="${1:-}"
  case "${raw,,}" in
    tiny | tiny.en) echo "Systran/faster-whisper-tiny" ;;
    base | base.en | "") echo "Systran/faster-whisper-base" ;;
    small | small.en) echo "Systran/faster-whisper-small" ;;
    medium | medium.en) echo "Systran/faster-whisper-medium" ;;
    large | large-v2 | large-v3) echo "Systran/faster-whisper-large-v3" ;;
    turbo | large-v3-turbo) echo "deepdml/faster-whisper-large-v3-turbo-ct2" ;;
    */*) echo "$raw" ;;
    *) echo "Systran/faster-whisper-base" ;;
  esac
}

# Print how many ASR models the Whisper API on port $1 reports.
# Any curl/jq failure or non-numeric answer is reported as 0.
_whisper_asr_count() {
  local whisper_port="$1" count
  count=$(curl -sf --max-time 12 \
    "http://127.0.0.1:${whisper_port}/v1/models?task=automatic-speech-recognition" \
    | jq -r '.data | length') || count=0
  if [[ ! "$count" =~ ^[0-9]+$ ]]; then
    count=0
  fi
  printf '%s\n' "$count"
}

# Ensure at least one ASR model is loaded in Whisper.
# Arguments: $1 - DreamServer working directory
# Returns:   0 always; every failure path only warns.
ensure_whisper_asr_model() {
  local ds_dir="$1"
  local env_file="${ds_dir}/.env"
  local whisper_port
  whisper_port="$(env_get "$env_file" "WHISPER_PORT")"
  whisper_port="${whisper_port:-9000}"

  if ! wait_for_http "http://127.0.0.1:${whisper_port}/health" 120 4; then
    warn "Whisper not reachable on port ${whisper_port} — skipping ASR bootstrap"
    return 0
  fi

  local asr_count
  asr_count=$(_whisper_asr_count "$whisper_port")
  if [[ "$asr_count" -gt 0 ]]; then
    log "Whisper ASR models already available (${asr_count})"
    return 0
  fi

  local whisper_cfg model_id encoded_model
  whisper_cfg="$(env_get "$env_file" "WHISPER_MODEL")"
  model_id="$(map_whisper_model_id "$whisper_cfg")"
  # The model id is a URL path segment, so "/" must be percent-encoded.
  encoded_model="${model_id//\//%2F}"

  warn "No ASR models — bootstrapping ${model_id}"
  if ! curl -sf -X POST --max-time 30 \
    "http://127.0.0.1:${whisper_port}/v1/models/${encoded_model}" >/dev/null; then
    warn "Could not trigger Whisper model download for ${model_id}"
    return 0
  fi

  _wait_for_asr "$whisper_port"
}

# Poll every 6s (up to 180s) until Whisper on port $1 lists an ASR model.
_wait_for_asr() {
  local whisper_port="$1"
  local waited=0 asr_count
  while [[ $waited -lt 180 ]]; do
    asr_count=$(_whisper_asr_count "$whisper_port")
    if [[ "$asr_count" -gt 0 ]]; then
      log "Whisper ASR model bootstrap complete (${asr_count} model(s))"
      return 0
    fi
    sleep 6
    waited=$((waited + 6))
  done
  warn "Whisper model download started but not ready — will appear shortly"
}

# Wait for Kokoro TTS to load at least one voice model.
# Arguments: $1 - DreamServer working directory
# Returns:   0 when no dream-tts container is running or TTS is unreachable.
ensure_tts_model_ready() {
  local ds_dir="$1"
  local env_file="${ds_dir}/.env"
  local tts_port
  tts_port="$(env_get "$env_file" "TTS_PORT")"
  tts_port="${tts_port:-8880}"

  # Capture the container list first: piping `docker ps` straight into
  # `grep -q` under pipefail can report a false negative when grep exits early
  # and docker receives SIGPIPE.
  local running
  running=$(docker ps --format '{{.Names}}' 2>>"${LOGFILE:-/dev/null}") || running=""
  if ! grep -q 'dream-tts' <<<"$running"; then
    return 0
  fi

  if ! wait_for_http "http://127.0.0.1:${tts_port}/health" 90 4; then
    warn "Kokoro TTS not reachable on port ${tts_port} — skipping"
    return 0
  fi

  local voice_count
  voice_count=$(curl -sf --max-time 10 "http://127.0.0.1:${tts_port}/v1/audio/voices" \
    | jq -r 'if type == "array" then length elif .voices then (.voices | length) else 0 end') \
    || voice_count=0

  if [[ "$voice_count" =~ ^[0-9]+$ ]] && [[ "$voice_count" -gt 0 ]]; then
    log "Kokoro TTS ready (${voice_count} voice(s))"
    return 0
  fi

  warn "Kokoro TTS starting — waiting for voice model..."
  _wait_for_tts "$tts_port"
}

# Poll every 6s (up to 90s) until the TTS API on port $1 lists a model.
_wait_for_tts() {
  local tts_port="$1"
  local waited=0 voice_count
  while [[ $waited -lt 90 ]]; do
    voice_count=$(curl -sf --max-time 10 "http://127.0.0.1:${tts_port}/v1/models" \
      | jq -r '.data | length') || voice_count=0
    if [[ "$voice_count" =~ ^[0-9]+$ ]] && [[ "$voice_count" -gt 0 ]]; then
      log "Kokoro TTS model loaded (${voice_count} model(s))"
      return 0
    fi
    sleep 6
    waited=$((waited + 6))
  done
  warn "Kokoro TTS model still loading — will be available shortly"
}

# Fix ComfyUI permissions for AMD vs NVIDIA mount layouts.
# Arguments: $1 - data directory
#            $2 - GPU backend ("amd" selects the ComfyUI/ sub-layout;
#                 anything else uses the NVIDIA layout; default nvidia)
# Returns:   0; per-directory failures only warn.
fix_comfyui_permissions() {
  local data_dir="$1"
  local gpu_backend="${2:-nvidia}"

  local -a dirs
  if [[ "$gpu_backend" == "amd" ]]; then
    dirs=("${data_dir}/comfyui/ComfyUI/models"
      "${data_dir}/comfyui/ComfyUI/output"
      "${data_dir}/comfyui/ComfyUI/input"
      "${data_dir}/comfyui/ComfyUI/custom_nodes")
  else
    dirs=("${data_dir}/comfyui/models"
      "${data_dir}/comfyui/output"
      "${data_dir}/comfyui/input"
      "${data_dir}/comfyui/workflows")
  fi

  # Resolve the ACL once; falls back to UID 1000 when no comfyui user exists.
  local comfy_uid acl
  comfy_uid=$(id -u comfyui 2>>"${LOGFILE:-/dev/null}" || echo 1000)
  acl="u::rwx,u:${comfy_uid}:rwx,g::rwx,o::rx"

  local d
  for d in "${dirs[@]}"; do
    if ! mkdir -p "$d"; then
      warn "comfyui mkdir failed on ${d} (non-fatal)"
      continue
    fi
    # [NON-FATAL: comfyui] ComfyUI will fail its own healthcheck if ACLs remain broken.
    if ! { chmod 2775 "$d" && setfacl -R -d -m "$acl" "$d"; }; then
      warn "comfyui ACL failed on ${d} (non-fatal)"
    fi
  done
}

# Download user-specified ComfyUI models from COMFYUI_EXTRA_MODELS env var.
# Format: "url|relative/target;url|relative/target;..."
# Arguments: $1 - DreamServer working directory
#            $2 - GPU backend (default nvidia)
# Returns:   0 when the variable is empty; download failures only warn.
comfyui_preload_models() {
  local ds_dir="$1"
  local gpu_backend="${2:-nvidia}"
  local env_file="${ds_dir}/.env"
  local data_dir="${ds_dir}/data"

  local extra_models
  extra_models="$(env_get "$env_file" "COMFYUI_EXTRA_MODELS")"
  if [[ -z "$extra_models" ]]; then
    return 0
  fi

  local models_root
  if [[ "$gpu_backend" == "amd" ]]; then
    models_root="${data_dir}/comfyui/ComfyUI/models"
  else
    models_root="${data_dir}/comfyui/models"
  fi
  mkdir -p "$models_root"

  log "Processing ComfyUI extra models..."
  local -a entries=()
  IFS=';' read -r -a entries <<<"$extra_models"

  local entry url target
  for entry in "${entries[@]}"; do
    # An entry without "|" has no target path and is skipped.
    [[ "$entry" == *'|'* ]] || continue
    url="$(_trim_whitespace "${entry%%|*}")"
    target="$(_trim_whitespace "${entry#*|}")"
    if [[ -z "$url" || -z "$target" ]]; then
      continue
    fi
    # A target with ".." components would be written outside models_root.
    if [[ "$target" == ".." || "$target" == ../* || "$target" == */../* || "$target" == */.. ]]; then
      warn "  Skipping ${target}: target must stay inside the ComfyUI models directory"
      continue
    fi
    _download_comfyui_model "$models_root" "$url" "$target"
  done

  apply_data_acl "$models_root"
  log "ComfyUI model preload complete"
}

# Download one model into models_root, skipping files that are already complete.
# Arguments: $1 - models root, $2 - source URL, $3 - path relative to $1
# Returns:   0 always; failures only warn.
_download_comfyui_model() {
  local models_root="$1" url="$2" target="$3"
  local dest="${models_root}/${target}"
  local dest_dir
  dest_dir="$(dirname "$dest")"
  if ! mkdir -p "$dest_dir"; then
    warn "  Failed to create ${dest_dir} (non-fatal)"
    return 0
  fi

  # A leftover aria2 control file means the previous download was interrupted,
  # so the file on disk is partial and must not count as "already exists".
  if [[ -f "$dest" && ! -f "${dest}.aria2" ]]; then
    log "  Already exists: ${target}"
    return 0
  fi

  log "  Downloading: ${target}..."
  if command -v aria2c &>/dev/null; then
    # [NON-FATAL: comfyui] Optional extra model download failures should not block install.
    aria2c -x 4 -s 4 -k 5M --file-allocation=none --console-log-level=warn \
      -d "$dest_dir" -o "$(basename "$dest")" "$url" 2>&1 | tail -3 \
      || warn "  Failed to download ${target} (non-fatal)"
  else
    # Download to a side file and move it into place only on success.
    # --fail turns HTTP errors into failures instead of saving the error page.
    local partial="${dest}.part"
    # [NON-FATAL: comfyui] Optional extra model download failures should not block install.
    if ! { curl -fL --progress-bar -o "$partial" "$url" && mv -f -- "$partial" "$dest"; }; then
      rm -f -- "$partial"
      warn "  Failed to download ${target} (non-fatal)"
    fi
  fi
  return 0
}

# Emit the Perl program used by patch_openclaw_inject_token_runtime.
# It reads the file named in its first argument, replaces the first
# "Fix model references" block, rewrites the file only when a replacement
# happened, and prints the replacement count (0 or 1) on stdout.
_openclaw_patch_program() {
  cat <<'PERL'
use strict;
use warnings;

my $path = shift @ARGV;
die "usage: patch <inject-token.js>\n" unless defined $path;

my $replacement = <<'JS';
// Fix model references to match what llama-server actually serves
if (LLM_MODEL) {
  const providerMap = config.models?.providers || config.providers || null;
  const providerName = providerMap ? Object.keys(providerMap)[0] : null;

  if (providerName && providerMap[providerName]) {
    const provider = providerMap[providerName];
    const ollamaUrl = process.env.OLLAMA_URL || '';
    const litellmKey = process.env.LITELLM_KEY || '';
    if (ollamaUrl) {
      const newBase = ollamaUrl.replace(/\/$/, '') + '/v1';
      if (provider.baseUrl !== newBase) {
        console.log(`[inject-token] updated provider baseUrl: ${provider.baseUrl} -> ${newBase}`);
        provider.baseUrl = newBase;
      }
      if (litellmKey && provider.apiKey !== litellmKey) {
        provider.apiKey = litellmKey;
        console.log(`[inject-token] updated provider apiKey from env`);
      }
    }

    if (Array.isArray(provider.models) && provider.models.length > 0) {
      const firstModel = provider.models[0];
      if (firstModel && typeof firstModel === 'object') {
        const oldValue = firstModel.name || firstModel.id || '';
        if (firstModel.name !== LLM_MODEL || firstModel.id !== LLM_MODEL) {
          firstModel.name = LLM_MODEL;
          firstModel.id = LLM_MODEL;
          console.log(`[inject-token] updated provider model: ${oldValue} -> ${LLM_MODEL}`);
        }
      }
    }
  }

  if (config.agents?.defaults) {
    const d = config.agents.defaults;
    const fullOld = d.model?.primary || '';
    if (fullOld && providerName) {
      const fullNew = `${providerName}/${LLM_MODEL}`;
      if (fullOld !== fullNew) {
        d.model = { primary: fullNew };
        d.models = { [fullNew]: {} };
        if (d.subagents) d.subagents.model = fullNew;
        console.log(`[inject-token] updated agent model refs: ${fullOld} -> ${fullNew}`);
      }
    }
  }

  if (config.agent && providerName) {
    const fullNew = `${providerName}/${LLM_MODEL}`;
    if (config.agent.model !== fullNew) {
      config.agent.model = fullNew;
      if (config.subagent) config.subagent.model = fullNew;
      console.log(`[inject-token] updated legacy agent model refs -> ${fullNew}`);
    }
  }
}
JS
chomp $replacement;

open(my $in, '<', $path) or die "cannot read $path: $!\n";
my $src = do { local $/; <$in> };
close($in);
$src = '' unless defined $src;

# From the "Fix model references" comment through the closing brace of its
# `if (LLM_MODEL)` block. The block end is recognised by the "Override LLM
# baseUrl" comment that follows it (lookahead, so that comment is kept).
my $block = qr!
  ^([ \t]*)//[ \t]*Fix\ model\ references\ to\ match\ what\ llama-server\ actually\ serves[ \t]*\n
  [ \t]*if[ \t]*\(LLM_MODEL\)[ \t]*\{
  .*?
  \n[ \t]*\}
  (?=[ \t]*\n\s*//[ \t]*Override\ LLM\ baseUrl\ for\ Token\ Spy\ monitoring)
!msx;

my $count = 0;
$src =~ s{$block}{
  my $indent = $1;
  (my $patched = $replacement) =~ s/^(?=.)/$indent/mg;
  $count++;
  $patched;
}e;

if ($count) {
  open(my $out, '>', $path) or die "cannot write $path: $!\n";
  print {$out} $src;
  close($out) or die "cannot finish writing $path: $!\n";
}
print $count;
PERL
}

# Patch OpenClaw's inject-token.js for model reference compatibility.
# Arguments: $1 - DreamServer working directory
# Returns:   0 always; a missing file, missing perl, or a failed patch run
#            only warns and leaves the file as it was.
patch_openclaw_inject_token_runtime() {
  local ds_dir="$1"
  local target="${ds_dir}/config/openclaw/inject-token.js"

  [[ -f "$target" ]] || return 0
  if ! command -v perl &>/dev/null; then
    warn "perl missing — cannot patch OpenClaw injector"
    return 0
  fi

  # Already patched? Keep idempotent. (-F: the markers are literal text.)
  if grep -qF 'const providerMap = config.models?.providers || config.providers || null;' "$target" \
    && grep -qF 'firstModel.name = LLM_MODEL;' "$target" \
    && grep -qF 'updated legacy agent model refs ->' "$target"; then
    log "OpenClaw injector patch already present: ${target}"
    return 0
  fi

  local before_hash
  before_hash=$(sha256sum "$target" | awk '{print $1}' || echo "")

  # The program opens and rewrites the file itself: `perl -i` without -n/-p
  # (or an explicit read loop) never reads the file, so nothing gets patched.
  local subs
  if ! subs=$(_openclaw_patch_program | perl - "$target" 2>>"${LOGFILE:-/dev/null}"); then
    warn "OpenClaw injector patch failed to run on ${target} — leaving unchanged"
    return 0
  fi

  _verify_openclaw_patch "$target" "$before_hash" "${subs:-0}"
}

# Report the outcome of the OpenClaw patch attempt.
# Arguments: $1 - patched file, $2 - sha256 before patching,
#            $3 - number of substitutions reported by the patch program
_verify_openclaw_patch() {
  local target="$1" before_hash="$2" subs="$3"
  local marker='const providerMap = config.models?.providers || config.providers || null;'

  # Anything that is not a plain count is treated as "no substitution".
  if [[ ! "$subs" =~ ^[0-9]+$ ]]; then
    subs=0
  fi

  if [[ "$subs" -eq 0 ]]; then
    if grep -qF "$marker" "$target"; then
      log "OpenClaw injector patch already present: ${target}"
    else
      warn "OpenClaw injector patch pattern not found in ${target} — leaving unchanged"
    fi
    return 0
  fi

  if grep -qF "$marker" "$target" \
    && grep -qF 'firstModel.name = LLM_MODEL;' "$target"; then
    local after_hash
    after_hash=$(sha256sum "$target" | awk '{print $1}' || echo "")
    if [[ "$before_hash" != "$after_hash" ]]; then
      log "Patched OpenClaw injector: ${target}"
    else
      log "OpenClaw injector patch already present: ${target}"
    fi
  else
    warn "OpenClaw injector patch could not be verified: ${target}"
  fi
}
#!/usr/bin/env bash
# ============================================================================
# DreamServer — P2P GPU Deploy Constants
# ============================================================================
# Part of: dream-server/installers/p2p-gpu/lib/
# Purpose: Readonly variables, colors, paths, thresholds
#
# Expects: (nothing — first file sourced)
# Provides: P2P_GPU_VERSION, VASTAI_VERSION (alias), PROVIDER_NAME,
#           DREAM_USER, DREAM_HOME, REPO_URL, REPO_BRANCH, MIN_DISK_GB,
#           MIN_VRAM_MB, INSTALLER_TIMEOUT, MULTIGPU_MIN_GPUS,
#           LOCKFILE, LOGFILE, PIDFILE_DIR, color codes
#
# Modder notes:
#   All constants are readonly. Only two can be overridden, via env vars set
#   BEFORE sourcing: P2P_GPU_PROVIDER (becomes PROVIDER_NAME) and
#   INSTALLER_TIMEOUT. Everything else is fixed here.
#   Variables are consumed by other files sourced after this one.
#   Sourcing this file more than once is safe: the definitions run only the
#   first time (re-declaring a readonly variable fails, which would abort
#   the caller under `set -e`).
#   To add a new provider: create phases/08-<provider>-quirks.sh (the layout
#   the README describes) and select it with P2P_GPU_PROVIDER.
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

# shellcheck disable=SC2034  # Variables used by sourcing scripts
set -euo pipefail

if [[ -z "${_P2P_GPU_CONSTANTS_LOADED:-}" ]]; then
  readonly _P2P_GPU_CONSTANTS_LOADED=1

  readonly P2P_GPU_VERSION="6.1.0"
  # Back-compat alias for phases that reference the old name
  readonly VASTAI_VERSION="$P2P_GPU_VERSION"
  readonly PROVIDER_NAME="${P2P_GPU_PROVIDER:-vastai}"
  readonly LOCKFILE="/tmp/dreamserver-p2p-gpu-setup.lock"
  readonly LOGFILE="/var/log/dreamserver-p2p-gpu-setup.log"
  readonly PIDFILE_DIR="/var/run/dreamserver-p2p-gpu"

  readonly DREAM_USER="dream"
  readonly DREAM_HOME="/home/${DREAM_USER}"
  readonly REPO_URL="https://github.com/Light-Heart-Labs/DreamServer.git"
  readonly REPO_BRANCH="main"
  readonly MIN_DISK_GB=40
  readonly MIN_VRAM_MB=8000
  readonly INSTALLER_TIMEOUT="${INSTALLER_TIMEOUT:-600}"
  readonly MULTIGPU_MIN_GPUS=2

  # ── Colors ────────────────────────────────────────────────────────────────
  # Stored as literal backslash sequences; whatever prints them has to
  # interpret the escapes (e.g. echo -e or printf '%b').
  readonly RED='\033[0;31m'
  readonly GREEN='\033[0;32m'
  readonly YELLOW='\033[1;33m'
  readonly CYAN='\033[0;36m'
  readonly BOLD='\033[1m'
  readonly DIM='\033[2m'
  readonly NC='\033[0m'
fi
#!/usr/bin/env bash
# ============================================================================
# DreamServer — P2P GPU Environment Helpers
# ============================================================================
# Part of: dream-server/installers/p2p-gpu/lib/
# Purpose: .env management, port checks, directory discovery, CPU capping,
#          ownership fixes, HTTP polling, GPU detection, post-install orchestrator
#
# Expects: DREAM_USER, DREAM_HOME, LOGFILE, log(), warn(), err()
# Provides: env_set(), env_get(), port_in_use(), find_dream_dir(),
#           ensure_dream_cli_command(),
#           cap_cpu_in_yaml(), cap_cpu_in_files(), get_compose_cpu_ceiling(),
#           compute_safe_cpu_cap(), fix_ownership(), wait_for_http(),
#           detect_gpu(), _cap_context_for_vram(), apply_post_install_fixes()
#
# Modder notes:
#   env_set is idempotent — safe to call multiple times with same key.
#   env_set creates .env with 0660 mode to protect secrets and allow dream user access.
#   find_dream_dir checks both expected DreamServer install paths.
#   detect_gpu() is the single source of truth for GPU detection —
#   call it once and reuse the result (avoid duplicate detection).
#   Helpers fall back to /dev/null when LOGFILE is unset so they can be used
#   (and tested) before logging is configured.
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# ── [FIX: env-perms] .env management with proper file permissions ───────────

# Set a key in .env idempotently (no duplicates, preserves inode).
# Creates the file with 0660 to protect secrets (WEBUI_SECRET, API keys, etc.)
# and allow dream user access.
# Arguments: $1 - env file, $2 - key, $3 - value (stored verbatim)
# Returns:   0 on success, 1 if the file could not be rewritten
env_set() {
  local file="$1" key="$2" value="$3"
  if [[ ! -f "$file" ]]; then
    install -m 0660 -o "${DREAM_USER:-root}" -g "${DREAM_USER:-root}" /dev/null "$file"
  fi

  # Key and value travel through the environment so awk treats them as plain
  # text: no regex in the key, and no sed replacement metacharacters
  # (&, \, delimiter) to corrupt the value.
  if ENV_SET_KEY="$key" awk '
      BEGIN { prefix = ENVIRON["ENV_SET_KEY"] "=" }
      index($0, prefix) == 1 { found = 1; exit }
      END { exit !found }
    ' "$file"; then
    local tmp
    tmp=$(mktemp) || return 1
    # Rewrite into a temp file, then copy the bytes back over the original so
    # its inode, owner and mode stay as they were (`sed -i` swaps in a new file).
    if ! {
      ENV_SET_KEY="$key" ENV_SET_VALUE="$value" awk '
          BEGIN { prefix = ENVIRON["ENV_SET_KEY"] "="; value = ENVIRON["ENV_SET_VALUE"] }
          index($0, prefix) == 1 { print prefix value; next }
          { print }
        ' "$file" > "$tmp" && cat "$tmp" > "$file"
    }; then
      rm -f -- "$tmp"
      return 1
    fi
    rm -f -- "$tmp"
  else
    # Terminate an unfinished last line first so the new key is not glued to it.
    if [[ -s "$file" && -n "$(tail -c 1 "$file")" ]]; then
      printf '\n' >> "$file"
    fi
    printf '%s=%s\n' "$key" "$value" >> "$file"
  fi
}

# Read a key from .env.
# Prints the first matching value with any trailing " # comment" and all
# quote characters removed; prints an empty line when the key is absent.
# Arguments: $1 - env file, $2 - key
# Returns:   0 always (a missing file prints nothing)
env_get() {
  local file="$1" key="$2"
  [[ -f "$file" ]] || return 0

  local value
  value=$(ENV_GET_KEY="$key" awk '
      BEGIN { prefix = ENVIRON["ENV_GET_KEY"] "=" }
      index($0, prefix) == 1 { print substr($0, length(prefix) + 1); exit }
    ' "$file" 2>>"${LOGFILE:-/dev/null}") || value=""

  value="${value%%[[:space:]]#*}" # inline comment
  value="${value//\"/}"
  value="${value//\'/}"
  printf '%s\n' "$value"
}

# Check if a TCP port is in use (has a listening socket).
# Arguments: $1 - port number
# Returns:   0 if listening, non-zero otherwise (including when ss is unavailable)
port_in_use() {
  local port="$1"
  local listing
  # Capture first, then match: `ss | grep -q` under pipefail can report
  # "not in use" when grep exits early and ss is killed by SIGPIPE.
  listing=$(ss -tln 2>>"${LOGFILE:-/dev/null}") || return 1
  grep -q ":${port} " <<<"$listing"
}

# Locate the active dream-server working directory.
# Outputs:  the directory path on stdout
# Returns:  0 when found, 1 when neither candidate exists
find_dream_dir() {
  local -a candidates=(
    "${DREAM_HOME}/dream-server"
    "${DREAM_HOME}/DreamServer/dream-server"
  )
  local candidate

  # Prefer directory with both .env and compose (fully configured)
  for candidate in "${candidates[@]}"; do
    if [[ -f "${candidate}/.env" && -f "${candidate}/docker-compose.base.yml" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  # Fallback: any existing directory (partially configured)
  for candidate in "${candidates[@]}"; do
    if [[ -d "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

# Install a stable `dream` command wrapper for root/non-root shells.
# Arguments: $1 - DreamServer working directory (must contain dream-cli)
#            $2 - wrapper path (optional, default /usr/local/bin/dream)
# Returns:   0 always; this is a convenience, so failures only warn.
ensure_dream_cli_command() {
  local ds_dir="$1"
  local wrapper="${2:-/usr/local/bin/dream}"
  local cli_path="${ds_dir}/dream-cli"

  if [[ ! -x "$cli_path" ]]; then
    warn "dream-cli not executable at ${cli_path} (skipping global dream command)"
    return 0
  fi

  # %q-quote the paths so spaces or shell metacharacters in the install
  # directory cannot break (or inject into) the generated script.
  local quoted_dir quoted_cli script
  printf -v quoted_dir '%q' "$ds_dir"
  printf -v quoted_cli '%q' "$cli_path"
  printf -v script '%s\n' \
    '#!/usr/bin/env bash' \
    'set -euo pipefail' \
    "dream_dir=${quoted_dir}" \
    'export DREAM_HOME="${DREAM_HOME:-${dream_dir}}"' \
    'cd "${dream_dir}" || exit 1' \
    "exec ${quoted_cli} \"\$@\""

  # [NON-FATAL: convenience] Missing wrapper only affects global dream alias.
  if ! printf '%s' "$script" > "$wrapper"; then
    warn "could not write ${wrapper} (non-fatal)"
    return 0
  fi
  chmod +x "$wrapper" || warn "chmod failed on ${wrapper} (non-fatal)"
  log "Installed global dream command: ${wrapper}"
}

# Cap CPU values in one YAML file to max_cpu.
# Rewrites `cpus:` lines, keeping indentation, quote style and any trailing
# comment:
#   - plain numbers (N or N.M, optionally quoted) <= max_cpu are left alone;
#   - plain numbers > max_cpu are lowered to max_cpu;
#   - values containing "${" and anything non-numeric are replaced by the
#     literal max_cpu (so an interpolated limit is always pinned to the cap).
# Arguments: $1 - YAML file (a missing file is skipped), $2 - max_cpu (numeric)
_cap_cpu_in_yaml_file() {
  local file="$1" max_cpu="$2"
  [[ -f "$file" ]] || return 0
  python3 - "$file" "$max_cpu" <<'PY'
import re, sys
path, cap = sys.argv[1], float(sys.argv[2])
try:
    with open(path, "r", encoding="utf-8") as fh:
        src = fh.read()
except (OSError, UnicodeDecodeError):
    # Unreadable or non-UTF-8 file: nothing we can safely rewrite.
    sys.exit(0)

def parse_numeric(value):
    raw = value.strip().strip("'\"")
    if re.fullmatch(r"[0-9]+(?:\.[0-9]+)?", raw):
        return float(raw)
    m = re.fullmatch(r"\$\{[^:}]+:-([0-9]+(?:\.[0-9]+)?)\}", raw)
    if m:
        return float(m.group(1))
    return None

def repl(m):
    indent, rhs, comment = m.group(1), m.group(2).strip(), m.group(3) or ""
    q = "'"
    if rhs[:1] in ("'", '"'):
        q = rhs[0]

    numeric = parse_numeric(rhs)
    needs_cap = ("${" in rhs) or (numeric is None) or (numeric > cap)
    if needs_cap:
        return f"{indent}cpus: {q}{cap:g}{q}{comment}"
    return m.group(0)

pat = re.compile(r"^(\s*)cpus:\s*([^#\n]+?)(\s+#.*)?$", re.M)
new = pat.sub(repl, src)
if new != src:
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(new)
PY
}

# Cap CPU values in all YAML files under a directory tree.
# Arguments: $1 - directory (a missing directory is skipped), $2 - max_cpu
cap_cpu_in_yaml() {
  local dir="$1" max_cpu="$2"
  [[ -d "$dir" ]] || return 0
  local f
  while IFS= read -r -d '' f; do
    _cap_cpu_in_yaml_file "$f" "$max_cpu"
  done < <(find "$dir" \( -name "*.yml" -o -name "*.yaml" \) -type f -print0)
  return 0
}

# Cap CPU values in a specific list of YAML files.
# Arguments: $1 - max_cpu, $2... - files
cap_cpu_in_files() {
  local max_cpu="$1"
  shift
  local f
  for f in "$@"; do
    _cap_cpu_in_yaml_file "$f" "$max_cpu"
  done
  return 0
}

# Return the CPU ceiling Docker can actually schedule, accounting for
# container-level CPU quotas that can differ from nproc.
# Outputs: the smaller of `nproc` and Docker's NCPU (at least 1)
get_compose_cpu_ceiling() {
  local logfile="${LOGFILE:-/dev/null}"
  local host_nproc docker_ncpu ceiling

  host_nproc=$(nproc 2>>"$logfile" || echo 1)
  if [[ ! "$host_nproc" =~ ^[0-9]+$ ]] || [[ "$host_nproc" -lt 1 ]]; then
    host_nproc=1
  fi

  ceiling="$host_nproc"
  docker_ncpu=$(docker info --format '{{.NCPU}}' 2>>"$logfile" || echo "")
  if [[ "$docker_ncpu" =~ ^[0-9]+$ ]] && [[ "$docker_ncpu" -gt 0 ]] \
    && [[ "$docker_ncpu" -lt "$ceiling" ]]; then
    ceiling="$docker_ncpu"
  fi

  printf '%s\n' "$ceiling"
}

# Compute a safe cpus: cap value with one-core headroom.
# Optional arg 1: hard ceiling discovered from daemon error output; it is only
# honoured when it is a positive integer below the detected ceiling.
# Outputs: ceiling - 1, never less than 1
compute_safe_cpu_cap() {
  local forced_ceiling="${1:-}"
  local ceiling

  ceiling=$(get_compose_cpu_ceiling)
  if [[ "$forced_ceiling" =~ ^[0-9]+$ ]] && [[ "$forced_ceiling" -gt 0 ]] \
    && [[ "$forced_ceiling" -lt "$ceiling" ]]; then
    ceiling="$forced_ceiling"
  fi

  if [[ "$ceiling" -gt 1 ]]; then
    printf '%s\n' "$((ceiling - 1))"
  else
    printf '%s\n' 1
  fi
}

# Fix ownership recursively (unconditional to catch nested root-owned files).
# Arguments: $1 - directory (a missing directory is skipped), $2 - user,
#            $3 - group (defaults to the user name)
fix_ownership() {
  local dir="$1" user="$2" group="${3:-$2}"
  [[ -d "$dir" ]] || return 0
  # Always apply chown recursively to fix root-owned files inside target-owned directories
  # chown may fail on NFS mounts or in containers without CAP_CHOWN
  if ! chown -R "${user}:${group}" "$dir" 2>>"${LOGFILE:-/dev/null}"; then
    warn "chown failed on ${dir} (non-fatal; host may block ownership changes)"
  fi
}

# Wait for a URL to answer successfully (curl --fail: no HTTP error status).
# Arguments: $1 - URL, $2 - timeout budget in seconds (default 60),
#            $3 - pause between attempts in seconds (default 5)
# Returns:   0 as soon as a request succeeds, 1 once the budget is used up.
# The budget counts pauses only; time spent inside each request (up to 5s)
# is not counted.
wait_for_http() {
  local url="$1" timeout="${2:-60}" interval="${3:-5}"
  # Fall back to the defaults for non-numeric input. A zero interval would
  # never advance `elapsed` and loop forever.
  if [[ ! "$timeout" =~ ^[0-9]+$ ]]; then
    timeout=60
  fi
  if [[ ! "$interval" =~ ^[0-9]+$ ]] || (( 10#$interval == 0 )); then
    interval=5
  fi
  timeout=$((10#$timeout))
  interval=$((10#$interval))

  local elapsed=0
  while [[ $elapsed -lt $timeout ]]; do
    if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
      return 0
    fi
    sleep "$interval"
    elapsed=$((elapsed + interval))
  done
  return 1
}

# ── [FIX: gpu-dedup] Single source of truth for GPU detection ───────────────
# Sets GPU_BACKEND, GPU_NAME, GPU_VRAM, GPU_COUNT as globals.
# Call once in preflight; all other code reads these variables.
# Detect the GPU backend and publish the result as globals.
# Globals written: GPU_BACKEND (nvidia|amd|cpu), GPU_NAME, GPU_VRAM (MiB of
#                  the first GPU), GPU_COUNT, GPU_TOTAL_VRAM (MiB).
# NVIDIA is preferred when nvidia-smi answers a query; AMD is assumed when
# rocm-smi is installed or /dev/kfd exists; otherwise the CPU defaults stay.
detect_gpu() {
    GPU_BACKEND="cpu"
    GPU_NAME="none"
    GPU_VRAM="0"
    GPU_COUNT=0
    GPU_TOTAL_VRAM=0
    local v

    if command -v nvidia-smi &>/dev/null \
        && nvidia-smi --query-gpu=name --format=csv,noheader &>/dev/null; then
        GPU_BACKEND="nvidia"
        GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>>"$LOGFILE" | sed -n '1p' | xargs)
        GPU_VRAM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>>"$LOGFILE" | sed -n '1p' | xargs)
        GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>>"$LOGFILE" | wc -l)
        # A non-numeric answer (e.g. "[N/A]") must not reach shell arithmetic.
        [[ "$GPU_VRAM" =~ ^[0-9]+$ ]] || GPU_VRAM=0
        GPU_TOTAL_VRAM=0
        while read -r v; do
            [[ "$v" =~ ^[0-9]+$ ]] || continue
            GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + v ))
        done < <(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>>"$LOGFILE")
        if [[ $GPU_TOTAL_VRAM -eq 0 ]]; then GPU_TOTAL_VRAM=$GPU_VRAM; fi

    elif command -v rocm-smi &>/dev/null || [[ -e /dev/kfd ]]; then
        GPU_BACKEND="amd"
        # `|| true` keeps whatever the pipeline printed; defaults are applied
        # afterwards so a failing probe can never produce a multi-line value.
        GPU_NAME=$(rocm-smi --showproductname 2>>"$LOGFILE" \
            | grep -oP 'Card series:\s*\K.*' | sed -n '1p') || true
        GPU_NAME="${GPU_NAME:-AMD GPU}"
        GPU_VRAM=$(rocm-smi --showmeminfo vram 2>>"$LOGFILE" \
            | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' | sed -n '1p') || true
        [[ "$GPU_VRAM" =~ ^[0-9]+$ ]] || GPU_VRAM=0
        # Convert bytes to MiB
        if [[ "$GPU_VRAM" -gt 1000000 ]]; then
            GPU_VRAM=$(( GPU_VRAM / 1048576 ))
        fi
        # grep -c prints "0" AND exits 1 when nothing matches, so the count is
        # validated explicitly instead of appending a fallback with `|| echo`.
        GPU_COUNT=$(rocm-smi --showid 2>>"$LOGFILE" | grep -c 'GPU\[') || true
        if [[ ! "$GPU_COUNT" =~ ^[0-9]+$ ]] || (( GPU_COUNT < 1 )); then
            GPU_COUNT=1
        fi
        if [[ $GPU_COUNT -ge 2 ]]; then
            GPU_TOTAL_VRAM=$(( GPU_VRAM * GPU_COUNT ))  # first-GPU VRAM x count (estimate)
        else
            GPU_TOTAL_VRAM=$GPU_VRAM
        fi
    fi

    # Pin packages after successful detection to prevent future mismatches
    if [[ "$GPU_BACKEND" == "nvidia" ]]; then
        _pin_nvidia_packages
    fi
}

# Lightweight backend-only detection (for subcommands that don't need full GPU info).
# Outputs: "nvidia", "amd" or "cpu" on stdout.
detect_gpu_backend() {
    if command -v nvidia-smi &>/dev/null && nvidia-smi &>/dev/null; then
        echo "nvidia"
    elif command -v rocm-smi &>/dev/null || [[ -e /dev/kfd ]]; then
        echo "amd"
    else
        echo "cpu"
    fi
}

# Return 0 when the given probe output contains one of the known NVML
# driver/library mismatch messages (case-insensitive).
# Arguments: $1 - captured command output (may be empty)
_has_nvml_mismatch_signature() {
    local output="${1:-}"
    # Here-string instead of `echo | grep -q`: no producer to SIGPIPE under pipefail.
    grep -Eqi \
        "driver/library version mismatch|failed to initialize nvml|nvidia-container-cli: initialization error: nvml error" \
        <<<"$output"
}

# ── [FIX: nvml-mismatch] NVIDIA driver/library version mismatch detection ────
# Compares the driver version reported by nvidia-smi on the host with the one
# reported by nvidia-smi inside a CUDA test container.
# Arguments: $1 - docker test image (default nvidia/cuda:12.4.1-base-ubuntu22.04)
# Env:       NVIDIA_DOCKER_TEST_TIMEOUT - container probe timeout, seconds (default 180)
# Returns:   0 = matched, 1 = mismatched, 2 = couldn't detect
# Outputs:   diagnostics via log(); raw probe output is appended to LOGFILE
detect_nvml_mismatch() {
    local host_driver container_cuda docker_test_image="${1:-nvidia/cuda:12.4.1-base-ubuntu22.04}"
    local test_timeout="${NVIDIA_DOCKER_TEST_TIMEOUT:-180}"
    local host_probe_output host_probe_rc container_probe_output container_probe_rc

    # Get host driver version
    host_probe_output=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>&1) \
        && host_probe_rc=0 || host_probe_rc=$?
    if [[ -n "$host_probe_output" ]]; then
        printf '%s\n' "$host_probe_output" >> "$LOGFILE"
    fi

    if [[ $host_probe_rc -eq 0 ]]; then
        host_driver=$(echo "$host_probe_output" | head -1 | xargs || echo "")
    elif _has_nvml_mismatch_signature "$host_probe_output"; then
        log "NVIDIA host probe reported NVML driver/library mismatch"
        return 1
    else
        host_driver=""
    fi

    if [[ -z "$host_driver" ]]; then
        log "NVIDIA driver version detection failed (non-fatal)"
        return 2
    fi

    # Get container CUDA driver compatibility version
    container_probe_output=$(timeout --signal=TERM "$test_timeout" \
        docker run --rm --gpus all "$docker_test_image" \
        nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>&1) \
        && container_probe_rc=0 || container_probe_rc=$?
    if [[ -n "$container_probe_output" ]]; then
        printf '%s\n' "$container_probe_output" >> "$LOGFILE"
    fi

    if [[ $container_probe_rc -eq 0 ]]; then
        container_cuda=$(echo "$container_probe_output" | head -1 | xargs || echo "")
    elif _has_nvml_mismatch_signature "$container_probe_output"; then
        log "NVIDIA container probe reported NVML driver/library mismatch"
        return 1
    else
        container_cuda=""
    fi

    if [[ -z "$container_cuda" ]]; then
        log "Container CUDA driver detection failed (non-fatal)"
        return 2
    fi

    # Compare major.minor versions (e.g., 535.104.05 → 535.104)
    local host_major_minor container_major_minor
    host_major_minor=$(echo "$host_driver" | cut -d. -f1,2)
    container_major_minor=$(echo "$container_cuda" | cut -d. -f1,2)

    log "NVIDIA driver mismatch check: host=${host_driver} (${host_major_minor}) vs container=${container_cuda} (${container_major_minor})"

    if [[ "$host_major_minor" != "$container_major_minor" ]]; then
        log "NVIDIA driver/library MISMATCH detected: host ${host_driver} != container ${container_cuda}"
        return 1
    fi

    log "NVIDIA driver/library versions aligned (${host_major_minor})"
    return 0
}

# ── [FIX: nvml-mismatch] Multi-strategy NVIDIA driver/library mismatch repair ──
# Strategy 1: Reload kernel modules (fastest, no packages needed)
# Strategy 2: Downgrade userspace libs to match kernel module version
# Strategy 3: Upgrade kernel module to match userspace libs (original approach)
# Non-fatal: logs warnings on failure but does not halt.
# Returns: 0 when no mismatch exists or a strategy fixed it, 1 otherwise.
repair_nvml_mismatch() {
    local host_probe_output="" kernel_version="" lib_version="" initial_status post_repair_status

    log "Attempting to repair NVIDIA driver/library mismatch..."

    detect_nvml_mismatch && initial_status=0 || initial_status=$?
    if [[ $initial_status -eq 0 ]]; then
        log "No mismatch detected, skipping repair"
        return 0
    elif [[ $initial_status -eq 2 ]]; then
        # [NON-FATAL: probe] NVML probe may fail on transient driver issues.
        host_probe_output=$(nvidia-smi 2>&1) || warn "nvidia-smi probe failed (non-fatal)"
        if _has_nvml_mismatch_signature "$host_probe_output"; then
            warn "NVIDIA host probe reports driver/library mismatch — forcing repair attempt"
        else
            warn "Unable to detect NVIDIA driver/library mismatch state (skipping repair)"
            return 1
        fi
    fi

    # Get kernel module version (the version that's actually loaded)
    if [[ -f /proc/driver/nvidia/version ]]; then
        kernel_version="$(grep -oP 'Kernel Module\s+\K[0-9.]+' /proc/driver/nvidia/version || echo "")"
    fi
    if [[ -z "${kernel_version:-}" ]] && [[ -f /sys/module/nvidia/version ]]; then
        kernel_version="$(cat /sys/module/nvidia/version 2>/dev/null || echo "")"  # stderr expected: file may not exist
    fi

    # Get NVML library version from nvidia-smi error output
    lib_version="$(nvidia-smi 2>&1 | grep -oP 'NVML library version:\s*\K[0-9.]+' || echo "")"

    if [[ -n "$kernel_version" ]]; then
        log "Kernel module version: ${kernel_version}"
    fi
    if [[ -n "$lib_version" ]]; then
        log "NVML library version: ${lib_version}"
    fi

    # ── Strategy 1: Kernel module reload ────────────────────────────────────
    # Unload and reload NVIDIA modules so the userspace libs match what loads.
    # This is the fastest fix and requires no package changes.
    log "Strategy 1: Attempting kernel module reload..."

    # Stop processes using the GPU before module unload.
    # Names are collected into an array so they are passed as separate,
    # properly quoted arguments.
    local -a gpu_containers=()
    mapfile -t gpu_containers < <(
        docker ps --format '{{.Names}}' --filter 'label=com.docker.compose.project' 2>/dev/null \
            | grep '^dream-' || true  # stderr expected: docker may not be running
    )
    if [[ ${#gpu_containers[@]} -gt 0 ]]; then
        log "Stopping Docker containers before module reload..."
        # [NON-FATAL: cleanup] Some containers may already be stopped or unresponsive.
        docker stop "${gpu_containers[@]}" >> "$LOGFILE" 2>&1 || warn "Some containers failed to stop (non-fatal)"
    fi

    # Stop persistence daemon if running
    if pgrep -x nvidia-persistenced >/dev/null 2>&1; then  # stderr expected: process check
        log "Stopping nvidia-persistenced..."
        # [NON-FATAL: cleanup] Persistence daemon may have already exited.
        # pkill signals every matching PID; a quoted $(pgrep) would hand kill
        # one newline-joined argument when several instances exist.
        pkill -x nvidia-persistenced 2>/dev/null || warn "nvidia-persistenced not running (non-fatal)"  # stderr expected: may not exist
        sleep 1
    fi

    # Kill any remaining GPU processes
    if [[ -e /dev/nvidia0 ]]; then
        local -a gpu_pids=()
        read -r -a gpu_pids < <(fuser /dev/nvidia* 2>/dev/null | xargs || true) || true  # stderr expected: fuser probe
        if [[ ${#gpu_pids[@]} -gt 0 ]]; then
            log "Killing GPU processes: ${gpu_pids[*]}"
            # [NON-FATAL: cleanup] Some GPU processes may have already exited.
            kill "${gpu_pids[@]}" 2>/dev/null || warn "some GPU processes already exited (non-fatal)"  # stderr expected: processes may have exited
            sleep 2
        fi
    fi

    # Unload modules in dependency order
    local reload_success=false
    # [NON-FATAL: cleanup] Each module may not be loaded on this host.
    rmmod nvidia_uvm 2>>"$LOGFILE" || warn "nvidia_uvm not loaded (non-fatal)"
    rmmod nvidia_drm 2>>"$LOGFILE" || warn "nvidia_drm not loaded (non-fatal)"
    rmmod nvidia_modeset 2>>"$LOGFILE" || warn "nvidia_modeset not loaded (non-fatal)"
    if rmmod nvidia 2>>"$LOGFILE"; then
        log "NVIDIA kernel modules unloaded successfully"
        # Reload — nvidia-smi triggers automatic module load
        sleep 1
        if nvidia-smi &>/dev/null; then  # stderr expected: driver reinit
            reload_success=true
            log "NVIDIA kernel modules reloaded — nvidia-smi works"
            local line
            nvidia-smi --query-gpu=driver_version,name --format=csv,noheader 2>>"$LOGFILE" | \
                while read -r line; do log " GPU: ${line}"; done
        else
            warn "nvidia-smi still fails after module reload"
        fi
    else
        warn "Could not unload nvidia module (in use) — trying strategy 2"
    fi

    if [[ "$reload_success" == "true" ]]; then
        # Verify with DKMS that module version matches kernel expectation
        if command -v dkms &>/dev/null; then  # stderr expected: dkms check
            local dkms_status
            dkms_status="$(dkms status 2>/dev/null | grep nvidia || echo "")"  # stderr expected: dkms probe
            if [[ -n "$dkms_status" ]]; then
                log "DKMS status: ${dkms_status}"
            fi
        fi

        # Restart Docker so it picks up the reloaded driver
        # [NON-FATAL: docker] Docker may not be managed by systemctl on Vast.ai.
        systemctl restart docker 2>>"$LOGFILE" || service docker restart 2>>"$LOGFILE" \
            || warn "Docker restart failed (non-fatal)"

        # Verify CUDA compat libs aren't shadowing host driver inside containers
        # (per NVIDIA NIM troubleshooting guide — bundled compat libs at
        # /usr/local/cuda-*/compat/ can override the host-mounted driver)
        # [NON-FATAL: nvidia-ctk] Toolkit may already be configured or unavailable.
        nvidia-ctk runtime configure --runtime=docker 2>>"$LOGFILE" \
            || warn "nvidia-ctk configure failed (non-fatal)"

        # Re-start any containers we stopped
        if [[ ${#gpu_containers[@]} -gt 0 ]]; then
            # [NON-FATAL: cleanup] Some containers may fail to restart on driver changes.
            docker start "${gpu_containers[@]}" >> "$LOGFILE" 2>&1 || warn "Some containers failed to restart (non-fatal)"
        fi

        detect_nvml_mismatch && post_repair_status=0 || post_repair_status=$?
        if [[ $post_repair_status -eq 0 ]]; then
            _pin_nvidia_packages
            return 0
        elif [[ $post_repair_status -eq 1 ]]; then
            warn "NVIDIA driver mismatch persists after module reload"
        else
            warn "Unable to verify NVIDIA driver/library mismatch after module reload"
        fi
    fi

    # ── Strategy 2: Downgrade userspace to match kernel module ──────────────
    # If we know the kernel module version, install matching userspace packages.
    if [[ -n "${kernel_version:-}" ]]; then
        log "Strategy 2: Aligning userspace libs to kernel module version ${kernel_version}..."
        local driver_major
        driver_major="$(echo "$kernel_version" | cut -d. -f1)"

        if type -t _wait_for_dpkg_lock >/dev/null 2>&1; then
            # [NON-FATAL: dpkg] apt will still enforce DPkg::Lock::Timeout.
            _wait_for_dpkg_lock 60 || warn "dpkg lock not released in time — DPkg::Lock::Timeout will handle"
        fi

        # Try to install the exact matching driver version
        if apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" update -qq 2>>"$LOGFILE" \
            && apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" install -y -qq \
                --allow-downgrades \
                "nvidia-utils-${driver_major}=${kernel_version}-*" \
                "libnvidia-ml-dev=${kernel_version}-*" \
                2>>"$LOGFILE"; then
            log "Userspace libs downgraded to match kernel ${kernel_version}"
            if nvidia-smi &>/dev/null; then  # stderr expected: driver reinit
                log "nvidia-smi works after userspace downgrade"
                detect_nvml_mismatch && post_repair_status=0 || post_repair_status=$?
                if [[ $post_repair_status -eq 0 ]]; then
                    _pin_nvidia_packages
                    return 0
                elif [[ $post_repair_status -eq 1 ]]; then
                    warn "NVIDIA driver mismatch persists after userspace downgrade"
                else
                    warn "Unable to verify NVIDIA driver/library mismatch after userspace downgrade"
                fi
            fi
        else
            warn "Userspace downgrade to ${kernel_version} failed — trying strategy 3"
        fi
    fi

    # ── Strategy 3: Upgrade everything (original approach) ──────────────────
    log "Strategy 3: Attempting full driver upgrade..."
    if type -t _wait_for_dpkg_lock >/dev/null 2>&1; then
        # [NON-FATAL: dpkg] apt will still enforce DPkg::Lock::Timeout.
        _wait_for_dpkg_lock 60 || warn "dpkg lock not released in time — DPkg::Lock::Timeout will handle"
    fi

    if apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" update -qq 2>>"$LOGFILE" \
        && apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" install -y -qq \
            --only-upgrade "nvidia-driver-*" 2>>"$LOGFILE"; then
        log "NVIDIA driver upgrade completed"
        systemctl restart docker 2>>"$LOGFILE" || service docker restart 2>>"$LOGFILE" \
            || warn "Docker restart failed (non-fatal)"
        sleep 2
        if nvidia-smi &>/dev/null; then  # stderr expected: driver reinit
            detect_nvml_mismatch && post_repair_status=0 || post_repair_status=$?
            if [[ $post_repair_status -eq 0 ]]; then
                log "NVIDIA driver mismatch RESOLVED after upgrade"
                _pin_nvidia_packages
                return 0
            elif [[ $post_repair_status -eq 1 ]]; then
                warn "NVIDIA driver mismatch persists after upgrade"
            else
                warn "Unable to verify NVIDIA driver/library mismatch after upgrade"
            fi
        else
            warn "nvidia-smi still fails after upgrade"
        fi
    else
        warn "NVIDIA driver upgrade failed"
    fi

    warn "All NVML mismatch repair strategies exhausted — GPU may not work"
    warn "Manual fix: reboot the instance, or try: rmmod nvidia_uvm nvidia_drm nvidia_modeset nvidia && nvidia-smi"
    return 1
}

# Pin NVIDIA packages to prevent unattended-upgrades from causing future mismatches
# (NVIDIA support stats: driver mismatches cause 31% of GPU cluster issues)
# Best-effort: individual hold/blacklist failures are logged, never fatal.
_pin_nvidia_packages() {
    # Hold nvidia packages so unattended-upgrades can't break them
    local held=0 pkg
    while IFS= read -r pkg; do
        [[ -n "$pkg" ]] || continue
        if apt-mark hold "$pkg" 2>>"$LOGFILE"; then
            held=$((held + 1))
        fi
    done < <(dpkg -l | grep -E '^ii\s+(nvidia-driver|nvidia-utils|nvidia-dkms|libnvidia)' | awk '{print $2}')
    if [[ $held -gt 0 ]]; then
        log "Pinned ${held} NVIDIA packages (prevents unattended-upgrades mismatch)"
    fi

    # Also blacklist nvidia from unattended-upgrades if config exists
    local uu_conf="/etc/apt/apt.conf.d/50unattended-upgrades"
    if [[ -f "$uu_conf" ]] && ! grep -q 'nvidia' "$uu_conf"; then
        if grep -q 'Unattended-Upgrade::Package-Blacklist' "$uu_conf"; then
            # [NON-FATAL: apt] Blacklist update is best-effort; mismatches are handled elsewhere.
            # Report success only when sed actually edited the file.
            if sed -i '/Unattended-Upgrade::Package-Blacklist/a\ "nvidia-*";' "$uu_conf" 2>>"$LOGFILE"; then
                log "Added nvidia-* to unattended-upgrades blacklist"
            else
                warn "Failed to add nvidia to unattended-upgrades blacklist (non-fatal)"
            fi
        fi
    fi
}

# ── Post-install fix orchestrator ───────────────────────────────────────────
# Called by phases/05, subcommands/fix, subcommands/resume.
# Coordinates all post-install fixes in correct order.
# Apply every post-install fix for a DreamServer checkout, in order: docker
# group membership, CPU limit capping, LLAMA CPU env caps, permission fixes,
# compatibility fixes, .env defaults, the global `dream` wrapper, VRAM-aware
# context capping and (NVIDIA only) a driver/library alignment check.
# Arguments: $1 - dream-server directory
#            $2 - GPU backend (nvidia|amd|cpu); "auto" (default) detects it
apply_post_install_fixes() {
    local ds_dir="$1"
    local gpu_backend="${2:-auto}"
    local data_dir="${ds_dir}/data"
    local env_file="${ds_dir}/.env"
    local cpu_count docker_cpu compose_ceiling max_cpu mismatch_status
    cpu_count=$(nproc 2>>"$LOGFILE" || echo 1)
    docker_cpu=$(docker info --format '{{.NCPU}}' 2>>"$LOGFILE" || echo "unknown")

    if [[ "$gpu_backend" == "auto" ]]; then
        gpu_backend=$(detect_gpu_backend)
    fi

    # Docker group membership
    if getent group docker &>/dev/null; then
        # [NON-FATAL: permissions] User can still run with sudo or log in again.
        usermod -aG docker "$DREAM_USER" || warn "docker group add failed (non-fatal)"
    fi

    # CPU limit fix — cap any cpus: value that exceeds (nproc - 1).
    # Always run: cheap no-op on files whose values already fit.
    compose_ceiling=$(get_compose_cpu_ceiling)
    max_cpu=$(compute_safe_cpu_cap)
    cap_cpu_in_yaml "$ds_dir" "$max_cpu"
    log "CPU limits capped to ${max_cpu} (nproc=${cpu_count}, docker=${docker_cpu}, ceiling=${compose_ceiling})"

    # Keep env-substituted CPU limits safe for overlays that use
    # ${LLAMA_CPU_LIMIT:-...} syntax.
    if [[ -f "$env_file" ]]; then
        local llama_limit="${max_cpu}.0"
        local llama_reservation="2.0"
        if [[ "$max_cpu" -lt 2 ]]; then
            llama_reservation="1.0"
        fi
        env_set "$env_file" "LLAMA_CPU_LIMIT" "$llama_limit"
        env_set "$env_file" "LLAMA_CPU_RESERVATION" "$llama_reservation"
        log "LLAMA CPU env caps set to limit=${llama_limit}, reservation=${llama_reservation}"
    fi

    _apply_permission_fixes "$ds_dir" "$data_dir" "$gpu_backend"
    _apply_compatibility_fixes "$ds_dir"
    _apply_env_defaults "$ds_dir" "$env_file" "$data_dir"
    ensure_dream_cli_command "$ds_dir"
    _cap_context_for_vram "$ds_dir"

    # ── [FIX: nvml-mismatch] Post-install NVIDIA driver check (fallback) ──────
    if [[ "$gpu_backend" == "nvidia" ]]; then
        log "Checking for NVIDIA driver/library version alignment (post-install)..."
        # Capture the status without tripping `set -e`; kept local so it does
        # not leak into the caller's namespace.
        detect_nvml_mismatch && mismatch_status=0 || mismatch_status=$?
        if [[ $mismatch_status -eq 1 ]]; then
            warn "NVIDIA driver/library mismatch detected post-install (non-fatal)"
            warn "Run 'bash setup.sh --fix' to repair, or manually upgrade nvidia-driver-*"
        elif [[ $mismatch_status -eq 2 ]]; then
            local host_probe_output=""
            # [NON-FATAL: probe] NVML probe may fail on transient driver issues.
            host_probe_output=$(nvidia-smi 2>&1) || warn "nvidia-smi probe failed (non-fatal)"
            if _has_nvml_mismatch_signature "$host_probe_output"; then
                warn "Host NVIDIA stack reports driver/library mismatch (non-fatal)"
                warn "If 'bash setup.sh --fix' cannot recover, reinstall NVIDIA driver package and reboot"
            fi
        fi
    fi

    log "Post-install fixes applied (including ACL-based permission system)"
}

# Run the ACL/ownership helpers for the data, extensions, user-extensions
# and logs directories and make helper scripts executable.
# Arguments: $1 - dream-server directory, $2 - data directory, $3 - GPU backend
_apply_permission_fixes() {
    local ds_dir="$1" data_dir="$2" gpu_backend="$3"
    ensure_acl_tools
    precreate_extension_data_dirs "$ds_dir"
    apply_data_acl "$data_dir"
    fix_known_uid_requirements "$data_dir" "$gpu_backend"
    configure_dream_umask
    create_permission_fix_script "$ds_dir"
    apply_data_acl "${ds_dir}/extensions"
    if [[ -d "${ds_dir}/user-extensions" ]]; then
        apply_data_acl "${ds_dir}/user-extensions"
    fi
    # [NON-FATAL: scripts] Missing exec bits only affects helper scripts.
    find "${ds_dir}/scripts" -name "*.sh" -exec chmod +x {} + || warn "chmod scripts failed (non-fatal)"
    mkdir -p "${ds_dir}/logs"
    apply_data_acl "${ds_dir}/logs"
}

# Run the service compatibility patches for a dream-server directory.
# Arguments: $1 - dream-server directory
_apply_compatibility_fixes() {
    local ds_dir="$1"
    ensure_whisper_ui_compatibility "$ds_dir"
    ensure_webui_stt_model_alignment "$ds_dir"
    patch_openclaw_inject_token_runtime "$ds_dir"
}

# Make sure .env exists with safe ownership/mode and that every required
# variable has a usable value.
# Arguments: $1 - dream-server directory, $2 - .env path, $3 - data directory
# Exits the shell (exit 1) when .env cannot be seeded or secured, because
# Docker Compose cannot start without it.
_apply_env_defaults() {
    local ds_dir="$1" env_file="$2" data_dir="$3"

    # Seed .env from .env.example if missing (fatal if fails — compose cannot start without all required variables)
    if [[ ! -f "$env_file" ]]; then
        local env_example="${ds_dir}/.env.example"
        if [[ -f "$env_example" ]]; then
            cp "$env_example" "$env_file" || {
                err ".env.example copy to ${env_file} failed — Docker Compose cannot start"
                exit 1
            }
            chown "${DREAM_USER}:${DREAM_USER}" "$env_file" || {
                err ".env ownership fix after copy failed — Docker Compose cannot start"
                exit 1
            }
            chmod 0660 "$env_file" || {
                err ".env chmod to 0660 after copy failed — Docker Compose cannot start"
                exit 1
            }
            log "Seeded .env from .env.example"
        else
            log "No .env.example found; will create .env via env_set()"
        fi
    fi

    # Fix .env ownership and permissions if file exists (fatal if fails — compose cannot start without readable .env)
    if [[ -f "$env_file" ]]; then
        # Check and fix ownership independently
        if [[ "$(stat -c '%U' "$env_file" 2>>"$LOGFILE" || echo root)" != "${DREAM_USER}" ]]; then
            chown "${DREAM_USER}:${DREAM_USER}" "$env_file" || {
                err ".env ownership fix failed — Docker Compose cannot start"
                exit 1
            }
        fi
        # Check and fix mode independently
        if [[ "$(stat -c '%a' "$env_file" 2>>"$LOGFILE")" != "660" ]]; then
            chmod 0660 "$env_file" || {
                err ".env chmod to 0660 failed — Docker Compose cannot start"
                exit 1
            }
        fi
    fi

    # Helper: Replace CHANGEME or empty with generated secret/value.
    # Reads env_file from the enclosing function's scope.
    _replace_changeme() {
        local key="$1" value="$2"
        local current
        current="$(env_get "$env_file" "$key")"
        if [[ -z "$current" || "$current" == "CHANGEME" ]]; then
            env_set "$env_file" "$key" "$value"
            log "Set ${key}"
        fi
    }

    # Generate or replace hard-required secrets (compose uses ${VAR:?error} syntax)
    _replace_changeme "WEBUI_SECRET" "$(openssl rand -hex 32)"
    _replace_changeme "SEARXNG_SECRET" "$(openssl rand -hex 32)"
    _replace_changeme "LITELLM_KEY" "sk-dream-$(openssl rand -hex 16)"
    _replace_changeme "N8N_PASS" "$(openssl rand -hex 16)"
    _replace_changeme "LIVEKIT_API_KEY" "$(openssl rand -hex 16)"
    _replace_changeme "LIVEKIT_API_SECRET" "$(openssl rand -hex 32)"
    _replace_changeme "DIFY_SECRET_KEY" "$(openssl rand -hex 32)"
    _replace_changeme "OPENCODE_SERVER_PASSWORD" "$(openssl rand -hex 16)"

    # Set non-secret required variables (also checked by compose)
    _replace_changeme "N8N_USER" "admin@dreamserver.local"
    _replace_changeme "OPENCLAW_TOKEN" "$(openssl rand -hex 24)"
    _replace_changeme "DASHBOARD_API_KEY" "$(openssl rand -hex 24)"

    # GGUF_FILE — detect from data/models if not set (largest *.gguf wins)
    if [[ -z "$(env_get "$env_file" "GGUF_FILE")" ]]; then
        local first_model
        # find's stderr goes to the log, NOT into the pipeline: with 2>&1 a
        # missing models/ directory turned the error text into GGUF_FILE.
        first_model=$(find "${data_dir}/models/" -maxdepth 1 -name "*.gguf" -type f \
            -printf '%s %f\n' 2>>"$LOGFILE" | sort -rn | head -1 | cut -d' ' -f2-) || true
        if [[ -n "$first_model" ]]; then
            env_set "$env_file" "GGUF_FILE" "$first_model"
            log "Set GGUF_FILE=${first_model}"
        fi
    fi
}

# ── VRAM-aware context size capping ───────────────────────────────────────
# The upstream installer sets CTX_SIZE=131072 when Hermes is enabled, but
# this exceeds VRAM on cards <=24 GB with large models. Cap CTX_SIZE based
# on available VRAM headroom after model weight, and enable KV cache
# quantization to maximize usable context within the budget.
# Cap CTX_SIZE (and, when a cap is applied, KV cache quantization) in
# <ds_dir>/.env so the model plus context fits the per-GPU VRAM budget, then
# bound the batch size via _cap_batch_for_vram.
# Arguments: $1 - dream-server directory
# Globals read: GPU_BACKEND, GPU_VRAM, GPU_COUNT, GPU_TOTAL_VRAM, GPU_VRAMS[],
#               TIER_MODEL_SIZE_MB
# No-op on CPU-only hosts and when VRAM or model size is unknown/non-numeric.
_cap_context_for_vram() {
    local ds_dir="$1"
    local env_file="${ds_dir}/.env"

    # Skip if no GPU
    if [[ "${GPU_BACKEND:-cpu}" == "cpu" ]]; then
        return 0
    fi

    local per_gpu_vram_mb="${GPU_VRAM:-0}"
    local model_size_per_gpu_mb=0
    local current_ctx model_size_mb headroom_mb safe_ctx kv_quant
    local gpu_count="${GPU_COUNT:-1}"
    # A garbled GPU_COUNT must not reach shell arithmetic.
    [[ "$gpu_count" =~ ^[0-9]+$ ]] || gpu_count=1

    # Multi-GPU: cap by per-GPU VRAM budget to avoid CUDA0 OOM
    if [[ "$gpu_count" -ge 2 && "${GPU_TOTAL_VRAM:-0}" -gt 0 ]]; then
        per_gpu_vram_mb=$(( GPU_TOTAL_VRAM / gpu_count ))
        if [[ "${GPU_VRAMS+set}" == "set" && "${#GPU_VRAMS[@]}" -gt 0 ]]; then
            # Heterogeneous cards: budget for the smallest one.
            local min_vram="${GPU_VRAMS[0]}"
            local vram
            for vram in "${GPU_VRAMS[@]}"; do
                if [[ "$vram" -lt "$min_vram" ]]; then
                    min_vram="$vram"
                fi
            done
            per_gpu_vram_mb="$min_vram"
        fi
    fi

    current_ctx="$(env_get "$env_file" "CTX_SIZE")"
    current_ctx="${current_ctx:-16384}"

    # Get model size from .env or fallback to TIER_MODEL_SIZE_MB
    model_size_mb="$(env_get "$env_file" "LLM_MODEL_SIZE_MB")"
    model_size_mb="${model_size_mb:-${TIER_MODEL_SIZE_MB:-0}}"

    # Non-integer values are treated like "unknown" instead of crashing the
    # arithmetic below.
    if [[ ! "$per_gpu_vram_mb" =~ ^[0-9]+$ || ! "$model_size_mb" =~ ^[0-9]+$ ]] \
        || [[ "$per_gpu_vram_mb" -eq 0 || "$model_size_mb" -eq 0 ]]; then
        log "VRAM or model size unknown -- skipping context cap"
        return 0
    fi

    # Split model weight across GPUs when available; fall back to full size on single GPU.
    if [[ "$gpu_count" -ge 2 ]]; then
        model_size_per_gpu_mb=$(( (model_size_mb + gpu_count - 1) / gpu_count ))
    else
        model_size_per_gpu_mb="$model_size_mb"
    fi

    # Calculate per-GPU headroom (VRAM - model weight per GPU - 1 GB overhead)
    headroom_mb=$(( per_gpu_vram_mb - model_size_per_gpu_mb - 1024 ))

    if [[ $headroom_mb -le 0 ]]; then
        # Model barely fits -- use minimum context
        safe_ctx=2048
        kv_quant="q4_0"
        warn "Model (${model_size_mb}MB) nearly exceeds GPU VRAM (${per_gpu_vram_mb}MB) -- setting CTX_SIZE=${safe_ctx}"
    elif [[ $headroom_mb -le 2048 ]]; then
        # ~2 GB headroom
        safe_ctx=4096
        kv_quant="q4_0"
    elif [[ $headroom_mb -le 4096 ]]; then
        # ~4 GB headroom (typical RTX 3090 with 18.6 GB model)
        safe_ctx=16384
        kv_quant="q8_0"
    elif [[ $headroom_mb -le 8192 ]]; then
        # ~8 GB headroom
        safe_ctx=32768
        kv_quant="q8_0"
    elif [[ $headroom_mb -le 16384 ]]; then
        # ~16 GB headroom (e.g., RTX 4090 with smaller model)
        safe_ctx=65536
        kv_quant="q8_0"
    else
        # >16 GB headroom -- large GPU, let it run
        safe_ctx=131072
        kv_quant="f16"
    fi

    local ctx_changed=false
    if [[ ! "$current_ctx" =~ ^[0-9]+$ ]]; then
        # Mirrors the LLAMA_BATCH_SIZE handling: an unusable value is replaced.
        env_set "$env_file" "CTX_SIZE" "$safe_ctx"
        log "CTX_SIZE invalid ('${current_ctx}') -- set to ${safe_ctx}"
        ctx_changed=true
    elif [[ "$current_ctx" -gt "$safe_ctx" ]]; then
        log "VRAM budget per GPU: ${per_gpu_vram_mb}MB, model per GPU: ${model_size_per_gpu_mb}MB, headroom: ${headroom_mb}MB"
        log "Capping CTX_SIZE: ${current_ctx} -> ${safe_ctx} (prevents OOM on ${per_gpu_vram_mb}MB GPU)"
        env_set "$env_file" "CTX_SIZE" "$safe_ctx"
        ctx_changed=true
    else
        log "CTX_SIZE=${current_ctx} fits within VRAM budget (${headroom_mb}MB headroom) -- no change"
    fi

    if [[ "$ctx_changed" == "true" ]]; then
        # Set KV cache quantization to maximize context within VRAM budget.
        # Only the K setting is inspected; an explicit non-f16 choice is kept.
        local current_kv_k
        current_kv_k="$(env_get "$env_file" "LLAMA_ARG_CACHE_TYPE_K")"

        if [[ "${current_kv_k:-f16}" == "f16" && "$kv_quant" != "f16" ]]; then
            env_set "$env_file" "LLAMA_ARG_CACHE_TYPE_K" "$kv_quant"
            env_set "$env_file" "LLAMA_ARG_CACHE_TYPE_V" "$kv_quant"
            log "KV cache quantization: f16 -> ${kv_quant} (reduces VRAM, trades some quality)"
        fi
    fi

    # NOTE(review): the batch cap is derived from safe_ctx (the ceiling), not
    # from the context size that is actually configured — confirm intended.
    _cap_batch_for_vram "$env_file" "$per_gpu_vram_mb" "$safe_ctx"
}

# ── VRAM-aware batch size capping ─────────────────────────────────────────
# Prevent compute buffer OOM on multi-GPU by bounding batch size per GPU.
# Arguments: $1 - .env path, $2 - per-GPU VRAM (MiB), $3 - context size
# Writes LLAMA_BATCH_SIZE only when the current value is invalid or too large.
_cap_batch_for_vram() {
    local env_file="$1" vram_mb="$2" ctx_size="$3"
    local current_batch safe_batch

    current_batch="$(env_get "$env_file" "LLAMA_BATCH_SIZE")"
    current_batch="${current_batch:-2048}"

    # Ceiling by VRAM tier
    if [[ "$vram_mb" -le 12288 ]]; then
        safe_batch=256
    elif [[ "$vram_mb" -le 16384 ]]; then
        safe_batch=512
    elif [[ "$vram_mb" -le 24576 ]]; then
        safe_batch=1024
    else
        safe_batch=2048
    fi

    # Large contexts tighten the ceiling further
    if [[ "$ctx_size" -ge 65536 && "$safe_batch" -gt 512 ]]; then
        safe_batch=512
    elif [[ "$ctx_size" -ge 32768 && "$safe_batch" -gt 1024 ]]; then
        safe_batch=1024
    fi

    if [[ ! "$current_batch" =~ ^[0-9]+$ ]]; then
        env_set "$env_file" "LLAMA_BATCH_SIZE" "$safe_batch"
        log "LLAMA_BATCH_SIZE invalid ('${current_batch}') -- set to ${safe_batch}"
        return 0
    fi

    if [[ "$current_batch" -gt "$safe_batch" ]]; then
        env_set "$env_file" "LLAMA_BATCH_SIZE" "$safe_batch"
        log "Capping LLAMA_BATCH_SIZE: ${current_batch} -> ${safe_batch} (prevents CUDA OOM)"
    else
        log "LLAMA_BATCH_SIZE=${current_batch} fits within VRAM budget -- no change"
    fi
}

# ── patch boundary (original diff header, kept verbatim as comments) ─────────
# diff --git a/dream-server/installers/p2p-gpu/lib/gpu-topology.sh b/dream-server/installers/p2p-gpu/lib/gpu-topology.sh
# new file mode 100644
# index 000000000..29a695338
# --- /dev/null
# +++ b/dream-server/installers/p2p-gpu/lib/gpu-topology.sh
# @@ -0,0 +1,366 @@
#!/usr/bin/env bash
# ============================================================================
# DreamServer — P2P GPU Topology & Multi-GPU Assignment
# ============================================================================
# Part of:  dream-server/installers/p2p-gpu/lib/
# Purpose:  Per-GPU enumeration, topology detection (NVLink/PCIe),
#           GPU-to-service assignment delegation, env var writing
#
# Expects:  GPU_BACKEND, GPU_COUNT, GPU_VRAM, LOGFILE,
#           log(), warn(), err(), env_set(), env_get()
# Provides: enumerate_gpus(), generate_topology_json(),
#           run_gpu_assignment()
#
# Modder notes:
#   All functions are no-ops when GPU_COUNT < 2. Single-GPU path is untouched.
#   Prefers upstream assign_gpus.py + nvidia-topo.sh when available;
#   built-in fallback handles pre-clone state.
#   GPU_UUIDS, GPU_VRAMS, GPU_NAMES are indexed arrays (not associative).
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# ── Per-GPU enumeration ──────────────────────────────────────────────────────
# Populates: GPU_UUIDS[], GPU_VRAMS[] (MiB each), GPU_NAMES[], GPU_TOTAL_VRAM
# No-op when GPU_COUNT < 2. When nothing can be enumerated, GPU_TOTAL_VRAM is
# estimated as GPU_VRAM * GPU_COUNT and the arrays stay empty.
enumerate_gpus() {
    if [[ "${GPU_COUNT:-0}" -lt 2 ]]; then
        return 0
    fi

    GPU_UUIDS=()
    GPU_VRAMS=()
    GPU_NAMES=()
    GPU_TOTAL_VRAM=0

    if [[ "${GPU_BACKEND:-}" == "nvidia" ]]; then
        local uuid vram name vram_int
        while IFS=', ' read -r uuid vram name; do
            if [[ -z "$uuid" ]]; then
                continue
            fi
            vram_int="${vram%%.*}"  # truncate decimals
            # Values such as "[N/A]" must not reach shell arithmetic.
            [[ "$vram_int" =~ ^[0-9]+$ ]] || vram_int=0
            GPU_UUIDS+=("$uuid")
            GPU_VRAMS+=("$vram_int")
            GPU_NAMES+=("$name")
            GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + vram_int ))
        # [NON-FATAL: probe] Topology is best-effort; fallback uses env values.
        # The warning is forced to stderr so it can never be parsed as a GPU row.
        done < <(nvidia-smi --query-gpu=gpu_uuid,memory.total,name \
            --format=csv,noheader,nounits 2>>"$LOGFILE" \
            || warn "nvidia-smi GPU enumeration failed (non-fatal)" >&2)

    elif [[ "${GPU_BACKEND:-}" == "amd" ]]; then
        local idx=0 line gpu_name vram_bytes vram_mb
        while IFS= read -r line; do
            if [[ -z "$line" ]]; then
                continue
            fi
            # First match only: several matching lines would otherwise yield
            # a multi-line value.
            gpu_name=$(rocm-smi -d "$idx" --showproductname 2>>"$LOGFILE" \
                | grep -oP 'Card series:\s*\K.*' | sed -n '1p') || true
            gpu_name="${gpu_name:-AMD GPU $idx}"
            vram_bytes=$(rocm-smi -d "$idx" --showmeminfo vram 2>>"$LOGFILE" \
                | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' | sed -n '1p') || true
            [[ "$vram_bytes" =~ ^[0-9]+$ ]] || vram_bytes=0
            vram_mb=$(( vram_bytes / 1048576 ))
            if [[ $vram_mb -lt 1000 ]]; then
                vram_mb=${GPU_VRAM:-0}  # fallback
            fi

            GPU_UUIDS+=("AMD-GPU-${idx}")
            GPU_VRAMS+=("$vram_mb")
            GPU_NAMES+=("$gpu_name")
            GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + vram_mb ))
            idx=$((idx + 1))
        done < <(rocm-smi --showid 2>>"$LOGFILE" | grep 'GPU\[' || true)
    fi

    # Sanity: if enumeration failed, fall back to count * per-GPU
    if [[ ${#GPU_UUIDS[@]} -eq 0 ]]; then
        GPU_TOTAL_VRAM=$(( ${GPU_VRAM:-0} * ${GPU_COUNT:-1} ))
        warn "GPU enumeration failed — estimated total VRAM: ${GPU_TOTAL_VRAM} MiB"
    fi
}

# ── Topology JSON generation ─────────────────────────────────────────────────
# Builds JSON matching upstream assign_gpus.py input schema.
# ── Topology JSON generation ─────────────────────────────────────────────────
# Builds JSON matching upstream assign_gpus.py input schema.
#
# Args:    $1 = output file path
# Reads:   GPU_COUNT, GPU_BACKEND, DS_DIR, LOGFILE, GPU_UUIDS[], GPU_VRAMS[],
#          GPU_NAMES[]
# Returns: 0 without writing anything when GPU_COUNT < 2; otherwise the status
#          of whichever strategy produced the file.
generate_topology_json() {
    local output_file="$1"
    [[ "${GPU_COUNT:-0}" -lt 2 ]] && return 0

    # Strategy 1: Use upstream nvidia-topo.sh if cloned
    if [[ -n "${DS_DIR:-}" && -f "${DS_DIR}/installers/lib/nvidia-topo.sh" \
          && "${GPU_BACKEND:-}" == "nvidia" ]]; then
        local upstream_topo
        upstream_topo=$(
            # Source upstream functions in subshell. warn/err are redefined to
            # write to stderr so they cannot end up in the captured JSON.
            warn() { echo "WARN: $*" >&2; }
            err() { echo "ERR: $*" >&2; }
            source "${DS_DIR}/installers/lib/nvidia-topo.sh" 2>>"$LOGFILE"
            detect_nvidia_topo 2>>"$LOGFILE"
        ) || upstream_topo=""
        if [[ -n "$upstream_topo" && "$upstream_topo" != "{}" ]]; then
            echo "$upstream_topo" > "$output_file"
            log "Topology generated via upstream nvidia-topo.sh"
            return 0
        fi
    fi

    # Strategy 2: Built-in — enumerate GPUs + parse topo matrix
    _generate_builtin_topology "$output_file"
}

# Escape a value for use inside a double-quoted JSON string.
# Control characters are dropped; backslash and double quote are escaped.
# Args: $1 = raw string.  Outputs: escaped string on stdout.
_json_escape() {
    printf '%s' "$1" \
        | tr -d '\000-\037' \
        | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

# Write the built-in topology document from the GPU_* arrays.
# Args: $1 = output file path
# The "links" array is filled by _parse_nvidia_topo_links for the nvidia
# backend and is left empty for every other backend.
_generate_builtin_topology() {
    local output_file="$1"
    local gpus_json="" links_json="[]"
    local i mem_gb uuid name

    # Build gpus array
    for i in "${!GPU_UUIDS[@]}"; do
        # The value is passed with -v instead of being spliced into the awk
        # program; LC_ALL=C keeps "." as the decimal separator.
        mem_gb=$(LC_ALL=C awk -v mib="${GPU_VRAMS[$i]:-0}" \
            'BEGIN { printf "%.1f", mib / 1024 }')
        uuid=$(_json_escape "${GPU_UUIDS[$i]}")
        name=$(_json_escape "${GPU_NAMES[$i]:-}")
        if [[ -n "$gpus_json" ]]; then
            gpus_json+=","
        fi
        gpus_json+="{\"index\":${i},\"uuid\":\"${uuid}\",\"name\":\"${name}\",\"memory_gb\":${mem_gb}}"
    done
    gpus_json="[${gpus_json}]"

    # Build links array from nvidia-smi topo -m
    if [[ "${GPU_BACKEND:-}" == "nvidia" ]]; then
        links_json=$(_parse_nvidia_topo_links)
    fi

    cat > "$output_file" << TOPO_EOF
{
  "vendor": "${GPU_BACKEND:-unknown}",
  "gpu_count": ${#GPU_UUIDS[@]},
  "gpus": ${gpus_json},
  "links": ${links_json}
}
TOPO_EOF

    log "Topology generated (built-in): ${#GPU_UUIDS[@]} GPUs"
}
# Link rank/label matching upstream nvidia-topo.sh
# Args:    $1 = link type token taken from the `nvidia-smi topo -m` matrix
# Outputs: numeric rank on stdout; unrecognised tokens yield 0
_link_rank() {
    case "$1" in
        NV4|NV6|NV8|NV12|NV18) echo 100 ;;
        XGMI|XGMI2)            echo 90 ;;
        NV1|NV2|NV3)           echo 80 ;;
        MIG)                   echo 70 ;;
        PIX)                   echo 50 ;;
        PXB)                   echo 40 ;;
        PHB)                   echo 30 ;;
        NODE)                  echo 20 ;;
        SYS|SOC)               echo 10 ;;
        *)                     echo 0 ;;
    esac
}

# Args:    $1 = link type token
# Outputs: human-readable label on stdout; unrecognised tokens yield "Unknown"
_link_label() {
    case "$1" in
        NV*)     echo "NVLink" ;;
        XGMI*)   echo "InfinityFabric" ;;
        MIG)     echo "MIG-SameDie" ;;
        PIX)     echo "PCIe-SameSwitch" ;;
        PXB)     echo "PCIe-CrossSwitch" ;;
        PHB)     echo "PCIe-HostBridge" ;;
        NODE)    echo "SameNUMA-NoBridge" ;;
        SYS|SOC) echo "CrossNUMA" ;;
        *)       echo "Unknown" ;;
    esac
}

# ── GPU-to-service assignment ─────────────────────────────────────────────────
# Args:    $1 = ds_dir, $2 = env_file
# Reads:   GPU_COUNT, GPU_UUIDS[], TIER_MODEL_SIZE_MB, LOGFILE
# Effects: writes the multi-GPU env vars, stores the topology document in
#          ${ds_dir}/config/gpu-topology.json and sets GGML_CUDA_P2P=1 when an
#          NV* link is present in the topology.
# Returns: 0 (also when assignment is skipped)
run_gpu_assignment() {
    local ds_dir="$1" env_file="$2"
    [[ "${GPU_COUNT:-0}" -lt 2 ]] && return 0

    # Enumerate only when the arrays are missing or empty.
    if [[ "${GPU_UUIDS+set}" != "set" ]] || [[ "${#GPU_UUIDS[@]}" -eq 0 ]]; then
        enumerate_gpus
    fi

    # mktemp instead of a PID-derived name: a predictable path in /tmp can be
    # pre-created or symlinked by another local user.
    local topo_file
    topo_file=$(mktemp "${TMPDIR:-/tmp}/ds-gpu-topo-XXXXXX" 2>>"$LOGFILE") || {
        warn "Could not create temporary topology file — skipping assignment"
        return 0
    }

    if ! generate_topology_json "$topo_file" || [[ ! -s "$topo_file" ]]; then
        rm -f -- "$topo_file"
        warn "Topology file not generated — skipping assignment"
        return 0
    fi

    local model_size_mb
    model_size_mb=$(env_get "$env_file" "LLM_MODEL_SIZE_MB")
    model_size_mb="${model_size_mb:-${TIER_MODEL_SIZE_MB:-5760}}"

    local assign_script="${ds_dir}/scripts/assign_gpus.py"
    local result=""

    # Strategy 1: Upstream assign_gpus.py
    # stderr goes to the log file; mixing it into stdout would make any
    # warning printed by the script invalidate the JSON and silently force
    # the fallback below.
    if [[ -f "$assign_script" ]] && command -v python3 &>/dev/null; then
        result=$(python3 "$assign_script" \
            --topology "$topo_file" \
            --model-size "$model_size_mb" 2>>"$LOGFILE") || {
            warn "assign_gpus.py failed (see ${LOGFILE})"
            result=""
        }
    fi

    if [[ -n "$result" ]] && jq -e '.gpu_assignment' <<<"$result" &>/dev/null; then
        _write_assignment_from_json "$result" "$env_file"
        log "GPU assignment via upstream assign_gpus.py"
    else
        # Strategy 2: Built-in fallback — all GPUs to llama
        _write_builtin_assignment "$env_file"
        log "GPU assignment via built-in fallback (all GPUs → llama)"
    fi

    # Save topology for dashboard-api
    mkdir -p "${ds_dir}/config"
    # [NON-FATAL: telemetry] Topology persistence only aids dashboard visibility.
    cp "$topo_file" "${ds_dir}/config/gpu-topology.json" 2>>"$LOGFILE" \
        || warn "failed to persist gpu-topology.json (non-fatal)"
    # [NON-FATAL: telemetry] Topology persistence only aids dashboard visibility.
    chmod 644 "${ds_dir}/config/gpu-topology.json" 2>>"$LOGFILE" \
        || warn "failed to set mode on gpu-topology.json (non-fatal)"

    # Enable P2P transfers when NVLink detected (avoids host RAM round-trip)
    if jq -e '.links[] | select(.link_type | startswith("NV"))' "$topo_file" &>/dev/null; then
        env_set "$env_file" "GGML_CUDA_P2P" "1"
        log "NVLink detected — enabled GGML_CUDA_P2P for direct GPU-to-GPU transfers"
    fi

    rm -f -- "$topo_file"
}

# Translate the parallelism mode reported by assign_gpus.py into a
# LLAMA_ARG_SPLIT_MODE value.
# Args:    $1 = mode string (may be empty)
# Outputs: "none", "row" or "layer" on stdout — and nothing else. Callers
#          capture stdout, so the warning is sent to stderr.
_map_llama_split_mode() {
    case "${1:-}" in
        ""|none|null)  echo "none" ;;
        tensor|hybrid) echo "row" ;;
        pipeline)      echo "layer" ;;
        layer|row)     echo "$1" ;;
        *)
            warn "Unknown split mode '${1}' from assign_gpus.py; defaulting to layer" >&2
            echo "layer"
            ;;
    esac
}

# Make sure LLAMA_ARG_MAIN_GPU holds a non-negative integer.
# Args: $1 = env file, $2 = effective split mode
#   - a non-empty, non-numeric value is always reset to 0 (as the warning says)
#   - an empty value is set to 0 only when a split mode other than "none" is used
_ensure_numeric_main_gpu() {
    local env_file="$1" split_mode="$2"
    local main_gpu
    main_gpu="$(env_get "$env_file" "LLAMA_ARG_MAIN_GPU")"

    if [[ "$main_gpu" =~ ^[0-9]+$ ]]; then
        return 0
    fi
    if [[ -n "$main_gpu" ]]; then
        warn "Invalid LLAMA_ARG_MAIN_GPU='${main_gpu}' — resetting to 0"
        env_set "$env_file" "LLAMA_ARG_MAIN_GPU" "0"
    elif [[ "$split_mode" != "none" ]]; then
        env_set "$env_file" "LLAMA_ARG_MAIN_GPU" "0"
    fi
}

# Write env vars from the JSON document printed by assign_gpus.py.
# Args: $1 = JSON text, $2 = env file
# Reads the .gpu_assignment.services.* entries with jq; a failing jq call
# leaves the corresponding value empty (or "none" for the split mode).
_write_assignment_from_json() {
    local json="$1" env_file="$2"
    local llama_uuids split_mode tensor_split main_gpu
    local svc uuid env_key

    llama_uuids=$(jq -r '.gpu_assignment.services.llama_server.gpus // [] | join(",")' \
        <<<"$json") || llama_uuids=""
    split_mode=$(jq -r '.gpu_assignment.services.llama_server.parallelism.mode // "none"' \
        <<<"$json") || split_mode="none"
    split_mode=$(_map_llama_split_mode "$split_mode")
    tensor_split=$(jq -r '
        .gpu_assignment.services.llama_server as $svc |
        if $svc.parallelism.tensor_split then ($svc.parallelism.tensor_split | map(tostring) | join(","))
        else "" end' <<<"$json") || tensor_split=""

    if [[ -n "$llama_uuids" ]]; then
        env_set "$env_file" "LLAMA_SERVER_GPU_UUIDS" "$llama_uuids"
    fi
    env_set "$env_file" "LLAMA_ARG_SPLIT_MODE" "$split_mode"
    if [[ -n "$tensor_split" ]]; then
        env_set "$env_file" "LLAMA_ARG_TENSOR_SPLIT" "$tensor_split"
    fi

    main_gpu=$(jq -r '.gpu_assignment.services.llama_server.parallelism.main_gpu_index // empty' \
        <<<"$json") || main_gpu=""
    if [[ "$main_gpu" =~ ^[0-9]+$ ]]; then
        env_set "$env_file" "LLAMA_ARG_MAIN_GPU" "$main_gpu"
    fi
    _ensure_numeric_main_gpu "$env_file" "$split_mode"

    # Per-service GPU UUIDs
    for svc in whisper comfyui embeddings; do
        uuid=$(jq -r ".gpu_assignment.services.${svc}.gpus[0]? // empty" <<<"$json") || uuid=""
        case "$svc" in
            whisper)    env_key="WHISPER_GPU_UUID" ;;
            comfyui)    env_key="COMFYUI_GPU_UUID" ;;
            embeddings) env_key="EMBEDDINGS_GPU_UUID" ;;
        esac
        if [[ -n "$uuid" && "$uuid" != "null" ]]; then
            env_set "$env_file" "$env_key" "$uuid"
        fi
    done

    env_set "$env_file" "GPU_COUNT" "${GPU_COUNT}"
    log "Multi-GPU env vars written: llama=[${llama_uuids}] mode=${split_mode}"
}

# Built-in fallback: hand every enumerated GPU to llama-server.
# Args:  $1 = env file
# Reads: GPU_UUIDS[], GPU_VRAMS[], GPU_COUNT
# Writes LLAMA_SERVER_GPU_UUIDS, LLAMA_ARG_SPLIT_MODE=layer,
# LLAMA_ARG_TENSOR_SPLIT (the per-GPU VRAM figures, comma separated) and
# GPU_COUNT.
_write_builtin_assignment() {
    local env_file="$1"
    local all_uuids="" split="" uuid vram   # loop variables stay local

    # All GPUs → llama-server
    for uuid in "${GPU_UUIDS[@]}"; do
        [[ -n "$all_uuids" ]] && all_uuids+=","
        all_uuids+="$uuid"
    done

    # VRAM-proportional tensor_split
    for vram in "${GPU_VRAMS[@]}"; do
        [[ -n "$split" ]] && split+=","
        split+="$vram"
    done

    if [[ -n "$all_uuids" ]]; then
        env_set "$env_file" "LLAMA_SERVER_GPU_UUIDS" "$all_uuids"
    fi
    env_set "$env_file" "LLAMA_ARG_SPLIT_MODE" "layer"
    if [[ -n "$split" ]]; then
        env_set "$env_file" "LLAMA_ARG_TENSOR_SPLIT" "$split"
    fi
    env_set "$env_file" "GPU_COUNT" "${GPU_COUNT}"
    _ensure_numeric_main_gpu "$env_file" "layer"

    log "Built-in assignment: all ${GPU_COUNT} GPUs → llama, mode=layer, split=${split}"
}
# ============================================================================
# Dream Server — Vast.ai Logging & Lifecycle
# ============================================================================
# Part of: p2p-gpu/lib/
# Purpose: Log/warn/err/step functions, timestamp helper, cleanup trap,
#          flock-based lock acquisition
#
# Expects:  LOGFILE, LOCKFILE, RED, GREEN, YELLOW, CYAN, BOLD, NC
# Provides: _ts(), log(), warn(), err(), step(), setup_cleanup_trap(),
#           acquire_lock()
#
# Modder notes:
#   Log writes use append-or-silent ( || : ) to avoid infinite recursion
#   if the logfile itself is unwritable. This is the ONE intentional
#   deviation from CLAUDE.md §4's "never || true" rule: the logging
#   functions ARE the warn() path, so they cannot call warn() on their
#   own failure without recursing. The 4 uses below are the only || :
#   in the entire toolkit.
#
#   warn() and err() print to stderr. Several helpers call warn() from inside
#   $(...) or <(...), where stdout is the data being captured; a warning on
#   stdout would end up inside that data.
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# Timestamp used as the prefix of every logfile line.
_ts() { date '+%Y-%m-%d %H:%M:%S'; }

# Informational message: stdout + logfile.
log() {
    echo -e "${GREEN}[✓]${NC} $*"
    echo "$(_ts) [INFO] $*" >> "$LOGFILE" || :
}

# Warning: stderr + logfile.
warn() {
    echo -e "${YELLOW}[!]${NC} $*" >&2
    echo "$(_ts) [WARN] $*" >> "$LOGFILE" || :
}

# Error: stderr + logfile. Does not exit.
err() {
    echo -e "${RED}[✗]${NC} $*" >&2
    echo "$(_ts) [ERROR] $*" >> "$LOGFILE" || :
}

# Phase banner: stdout + logfile.
step() {
    echo -e "\n${CYAN}${BOLD}━━━ $* ━━━${NC}\n"
    echo "$(_ts) [STEP] $*" >> "$LOGFILE" || :
}

# ── Cleanup trap ────────────────────────────────────────────────────────────
# Installs an EXIT handler that, on a non-zero exit status, prints the failing
# line, the log tail and recovery hints, then re-exits with the same status.
# INT/TERM/HUP are mapped to exit status 130.
setup_cleanup_trap() {
    _vastai_cleanup() {
        local exit_code=$?
        if [[ $exit_code -ne 0 ]]; then
            err "Script failed at line ${BASH_LINENO[0]:-unknown} (exit code: ${exit_code})"
            err "Full log: ${LOGFILE}"
            err "Last 10 lines:"
            tail -10 "$LOGFILE" 2>&1 | sed 's/^/    /' || warn "could not read log tail"
            echo ""
            echo -e "${YELLOW}${BOLD}  What to try next:${NC}"
            echo -e "    ${BOLD}bash $0 --fix${NC}      Apply fixes and restart services"
            echo -e "    ${BOLD}bash $0 --resume${NC}   Quick restart (skip install phases)"
            echo -e "    ${BOLD}bash $0 --status${NC}   Check what's actually running"
            echo ""
        fi
        # Release flock (fd 9 auto-closes on exit)
        exit "$exit_code"
    }
    trap _vastai_cleanup EXIT
    trap 'err "Interrupted by signal"; exit 130' INT TERM HUP
}

# ── Flock-based lock ────────────────────────────────────────────────────────
# Opens LOCKFILE on fd 9 and takes a non-blocking exclusive lock.
# Exits the script with status 1 when the lock is already held.
acquire_lock() {
    exec 9>"$LOCKFILE"
    if ! flock -n 9; then
        err "Another instance is already running."
        echo -e "  ${YELLOW}Wait for it to finish, or force remove:${NC} rm ${LOCKFILE}"
        exit 1
    fi
}

# ── dpkg lock helper (used by phases 00 and 01) ─────────────────────────────
# Waits for the dpkg frontend lock to be released, stopping unattended-upgrades
# if it is running. Returns 0 when lock is free, 1 on timeout.
# Args: $1 = maximum wait in seconds (default 90)
_wait_for_dpkg_lock() {
    local max_wait="${1:-90}"
    local elapsed=0

    if ! fuser /var/lib/dpkg/lock-frontend &>/dev/null; then   # stderr expected: fuser probe
        return 0   # Lock is free
    fi

    log "dpkg lock held — attempting to release (timeout ${max_wait}s)"

    # Stop unattended-upgrades if it's the culprit.
    # pgrep instead of `ps aux | grep -q`: under pipefail the pipeline reports
    # failure when grep -q exits early and ps is killed by SIGPIPE.
    if pgrep -f "unattended-upgrade" &>/dev/null; then
        log "Stopping unattended-upgrades service..."
        systemctl stop unattended-upgrades 2>>"$LOGFILE" || warn "systemctl stop failed (non-fatal)"
        # Also kill any lingering child processes
        pkill -f unattended-upgrade 2>/dev/null \
            || warn "no unattended-upgrade process found (non-fatal)"   # stderr expected: no matching process
    fi

    # Poll until lock is released
    while fuser /var/lib/dpkg/lock-frontend &>/dev/null; do   # stderr expected: fuser probe
        if [[ $elapsed -ge $max_wait ]]; then
            warn "dpkg lock still held after ${max_wait}s — proceeding with DPkg::Lock::Timeout"
            return 1
        fi
        sleep 3
        elapsed=$((elapsed + 3))
        if (( elapsed % 15 == 0 )); then
            log "Still waiting for dpkg lock... (${elapsed}s / ${max_wait}s)"
        fi
    done

    log "dpkg lock released after ${elapsed}s"

    # Clean up any interrupted package state
    if ! dpkg --configure -a 2>>"$LOGFILE"; then
        warn "dpkg --configure -a failed (non-fatal) — DPkg::Lock::Timeout will handle"
    fi

    return 0
}
# ── GPU-aware tier model resolution ───────────────────────────────────────────
# Picks the model for the available VRAM. When the upstream tier-map.sh is
# present under ds_dir it is sourced in a subshell and asked for the tier
# config; otherwise a built-in VRAM table is used.
#
# With two or more GPUs the budget is the total VRAM minus a per-GPU reserve
# (P2P_TIER_VRAM_RESERVE_MB, default 1024); the reserve is skipped when it
# would consume the whole total.
#
# Sets: TIER_GGUF_FILE, TIER_GGUF_URL, TIER_MODEL_SIZE_MB
# Args: $1 = ds_dir, $2 = gpu_backend, $3 = gpu_vram_mb, $4 = gpu_count
resolve_tier_for_gpu() {
    local ds_dir="$1" gpu_backend="$2" vram_mb="${3:-0}" gpu_count="${4:-1}"
    local tier_map="${ds_dir}/installers/lib/tier-map.sh"
    local reserve_mb_per_gpu="${P2P_TIER_VRAM_RESERVE_MB:-1024}"
    local total_vram_mb="${GPU_TOTAL_VRAM:-$(( vram_mb * gpu_count ))}"
    local effective_vram_mb="$vram_mb"

    if (( gpu_count >= 2 )); then
        local reserve_total=$(( reserve_mb_per_gpu * gpu_count ))
        effective_vram_mb="$total_vram_mb"
        if (( total_vram_mb > reserve_total )); then
            effective_vram_mb=$(( total_vram_mb - reserve_total ))
        fi
        log "Tier VRAM budget: per_gpu=${vram_mb}MB total=${total_vram_mb}MB reserve=${reserve_mb_per_gpu}MB x${gpu_count} -> effective=${effective_vram_mb}MB"
    fi

    TIER_GGUF_FILE=""
    TIER_GGUF_URL=""
    TIER_MODEL_SIZE_MB=0

    # ── Strategy 1: upstream tier-map.sh (most accurate) ──
    if [[ -f "$tier_map" ]]; then
        local tier=0   # anything that is not nvidia/amd is treated as CPU-only
        case "$gpu_backend" in
            nvidia)
                if   (( effective_vram_mb >= 90000 )); then tier="NV_ULTRA"
                elif (( effective_vram_mb >= 40000 )); then tier=4
                elif (( effective_vram_mb >= 20000 )); then tier=3
                elif (( effective_vram_mb >= 12000 )); then tier=2
                elif (( effective_vram_mb >= 4000 ));  then tier=1
                else                                        tier=0
                fi
                ;;
            amd)
                if   (( effective_vram_mb >= 20000 )); then tier=3
                elif (( effective_vram_mb >= 12000 )); then tier=2
                else                                        tier=1
                fi
                ;;
        esac

        # The subshell keeps upstream variables and functions out of this shell.
        local result
        result=$(
            TIER="$tier"
            MODEL_PROFILE="${MODEL_PROFILE:-qwen}"
            error() { echo "ERROR: $*" >&2; return 1; }
            source "$tier_map" 2>>"$LOGFILE"
            resolve_tier_config 2>>"$LOGFILE"
            echo "${GGUF_FILE}|${GGUF_URL:-}|${LLM_MODEL_SIZE_MB:-0}"
        ) || result=""

        if [[ -n "$result" ]]; then
            # result is "file|url|size"
            local tail_fields="${result#*|}"
            TIER_GGUF_FILE="${result%%|*}"
            TIER_GGUF_URL="${tail_fields%%|*}"
            TIER_MODEL_SIZE_MB="${tail_fields##*|}"
            if [[ -n "$TIER_GGUF_FILE" ]]; then
                log "Tier resolved via upstream tier-map: ${TIER_GGUF_FILE} (tier ${tier}, ${effective_vram_mb}MB effective VRAM)"
                return 0
            fi
        fi
    fi

    # ── Strategy 2: built-in VRAM lookup (tier-map.sh unavailable or empty) ──
    # Uses qwen profile defaults matching upstream's set_qwen_tier_config()
    local bucket
    if [[ "$gpu_backend" != "nvidia" && "$gpu_backend" != "amd" ]]; then
        bucket="minimal"
    elif (( effective_vram_mb >= 90000 )); then
        bucket="ultra"
    elif (( effective_vram_mb >= 24000 )); then
        bucket="large"
    elif (( effective_vram_mb >= 12000 )); then
        bucket="medium"
    elif (( effective_vram_mb >= 4000 )); then
        bucket="small"
    else
        bucket="minimal"
    fi

    case "$bucket" in
        ultra)
            # NV_ULTRA: B200 (180GB), multi-A100/H100, etc.
            TIER_GGUF_FILE="qwen3-coder-next-Q4_K_M.gguf"
            TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3-Coder-Next-GGUF/resolve/main/Qwen3-Coder-Next-Q4_K_M.gguf"
            TIER_MODEL_SIZE_MB=48500
            ;;
        large)
            # Tier 3-4: RTX 3090/4090 (24GB), A6000 (48GB), A100 (40/80GB), H100 (80GB)
            TIER_GGUF_FILE="Qwen3-30B-A3B-Q4_K_M.gguf"
            TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF/resolve/main/Qwen3-30B-A3B-Q4_K_M.gguf"
            TIER_MODEL_SIZE_MB=18600
            ;;
        medium)
            # Tier 2: RTX 3060 (12GB), RTX 4070 (12GB), RTX 3080 Ti (12GB)
            TIER_GGUF_FILE="Qwen3.5-9B-Q4_K_M.gguf"
            TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf"
            TIER_MODEL_SIZE_MB=5760
            ;;
        small)
            # Tier 1: RTX 3070 (8GB), RTX 3080 (10GB), GPUs with 4-12GB VRAM
            # 4B model (2,870 MB) leaves enough headroom for KV cache on 8GB GPUs
            TIER_GGUF_FILE="Qwen3.5-4B-Q4_K_M.gguf"
            TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf"
            TIER_MODEL_SIZE_MB=2870
            ;;
        minimal)
            # Tier 0: <4GB VRAM or CPU-only
            TIER_GGUF_FILE="Qwen3.5-2B-Q4_K_M.gguf"
            TIER_GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf"
            TIER_MODEL_SIZE_MB=1500
            ;;
    esac

    log "Tier resolved via built-in lookup: ${TIER_GGUF_FILE} (${effective_vram_mb}MB effective VRAM)"
}
# ── [FIX: disk-check] Verify sufficient disk before starting a download ─────
# Returns 0 if enough space, 1 if insufficient (or if free space could not be
# determined).
# Args: $1 = directory to check, $2 = minimum GB required (default: 5)
check_disk_for_download() {
    local target_dir="$1"
    local min_gb="${2:-5}"
    local probe_dir="$target_dir" avail_gb

    # df fails on a path that does not exist yet, so measure the nearest
    # existing ancestor instead of reporting "0GB available".
    while [[ -n "$probe_dir" && "$probe_dir" != "/" && ! -e "$probe_dir" ]]; do
        probe_dir=$(dirname -- "$probe_dir")
    done

    # `|| avail_gb=""` keeps a failing pipeline from aborting callers that run
    # under `set -e -o pipefail`.
    avail_gb=$(df -BG --output=avail "$probe_dir" 2>>"$LOGFILE" \
        | tail -n 1 | tr -dc '0-9') || avail_gb=""
    avail_gb="${avail_gb:-0}"

    if [[ "$avail_gb" -lt "$min_gb" ]]; then
        warn "Insufficient disk space: ${avail_gb}GB available, ${min_gb}GB needed in ${target_dir}"
        return 1
    fi
    return 0
}

# ── [FIX: pkill] PID-file based process management ─────────────────────────
# Store a background process PID so we can stop it safely later.
# Args: $1 = logical name, $2 = PID
# Returns 0 even when the pidfile cannot be written; only cleanup tracking is
# affected in that case.
_store_pid() {
    local name="$1" pid="$2"
    # [NON-FATAL: pidfile] Missing pidfile dir only affects cleanup tracking.
    if ! mkdir -p "$PIDFILE_DIR" 2>>"$LOGFILE"; then
        warn "could not create pidfile directory ${PIDFILE_DIR} (non-fatal)"
        return 0
    fi
    { echo "$pid" > "${PIDFILE_DIR}/${name}.pid"; } 2>>"$LOGFILE" \
        || warn "could not write pidfile for ${name} (non-fatal)"
    return 0
}

# Kill a previously stored PID by name. Safe — only kills the exact PID.
# Args: $1 = logical name
# A pidfile whose content is not a positive integer is removed without
# signalling anything: `kill 0` and negative values address process groups.
_kill_stored_pid() {
    local name="$1"
    local pidfile="${PIDFILE_DIR}/${name}.pid"
    local pid

    [[ ! -f "$pidfile" ]] && return 0
    pid=$(cat "$pidfile" 2>>"$LOGFILE" || echo "")

    if [[ "$pid" =~ ^[1-9][0-9]*$ ]]; then
        if kill -0 "$pid" 2>/dev/null; then   # stderr expected: process may already have exited
            # [NON-FATAL: cleanup] Process may already be gone; continue cleanup.
            kill "$pid" 2>>"$LOGFILE" || warn "Could not kill ${name} (PID ${pid})"
        fi
    elif [[ -n "$pid" ]]; then
        warn "Ignoring invalid PID '${pid}' in ${pidfile}"
    fi
    rm -f "$pidfile"
}
# Resolve download URL for a model filename
# Args:    $1 = ds_dir, $2 = model filename (e.g. Foo-Q4_K_M.gguf)
# Outputs: the URL on stdout
# Returns: 0 when a strategy produced a URL, 1 otherwise
resolve_model_url() {
    local ds_dir="$1" model_name="$2"
    local url

    # Strategy 1: model-upgrade log
    url=$(_resolve_from_log "$ds_dir" "$model_name") && [[ -n "$url" ]] && { echo "$url"; return 0; }

    # Strategy 2: upstream tier-map.sh
    url=$(_resolve_from_tiermap "$ds_dir" "$model_name") && [[ -n "$url" ]] && { echo "$url"; return 0; }

    # Strategy 3: backend JSON configs
    url=$(_resolve_from_backends "$ds_dir" "$model_name") && [[ -n "$url" ]] && { echo "$url"; return 0; }

    # Strategy 4: probe common HuggingFace orgs
    url=$(_resolve_from_hf_probe "$model_name") && [[ -n "$url" ]] && { echo "$url"; return 0; }

    return 1
}

# The three grep-based strategies below share two conventions:
#   - the model name is wrapped in \Q...\E so it is matched literally
#     (an unquoted "." would match any character);
#   - the match list is captured first and trimmed with parameter expansion,
#     because `grep | head -1` can fail under pipefail after printing a hit.

# Strategy 1: last matching URL recorded in logs/model-upgrade.log.
_resolve_from_log() {
    local ds_dir="$1" model_name="$2"
    local upgrade_log="${ds_dir}/logs/model-upgrade.log"
    local matches

    if [[ ! -f "$upgrade_log" ]]; then
        return 1
    fi
    matches=$(grep -oP 'https://huggingface\.co/[^\s"]+\Q'"${model_name}"'\E' \
        "$upgrade_log") || return 1
    printf '%s\n' "${matches##*$'\n'}"   # last match
}

# Strategy 2: first matching URL in the upstream tier-map.sh.
_resolve_from_tiermap() {
    local ds_dir="$1" model_name="$2"
    local tier_map="${ds_dir}/installers/lib/tier-map.sh"
    local matches

    if [[ ! -f "$tier_map" ]]; then
        return 1
    fi
    matches=$(grep -oP 'https://huggingface\.co/[^\s"'"'"']+\Q'"${model_name}"'\E' \
        "$tier_map") || return 1
    printf '%s\n' "${matches%%$'\n'*}"   # first match
}

# Strategy 3: first matching URL anywhere under config/backends.
_resolve_from_backends() {
    local ds_dir="$1" model_name="$2"
    local backend_dir="${ds_dir}/config/backends"
    local matches

    if [[ ! -d "$backend_dir" ]]; then
        return 1
    fi
    matches=$(grep -rhoP 'https://huggingface\.co/[^\s"]+\Q'"${model_name}"'\E' \
        "$backend_dir") || return 1
    printf '%s\n' "${matches%%$'\n'*}"   # first match
}

# Strategy 4: probe <org>/<base>-GGUF repositories with a HEAD request.
# Args:    $1 = model filename
# Outputs: the first URL answering 200, 301 or 302
# Returns: 0 on a hit, 1 when no organisation has the file
_resolve_from_hf_probe() {
    local model_name="$1"
    local base_name org test_url status

    # Strip the quantisation suffix to get the repository base name. Handles
    # multi-part tags (Q4_K_M, Q5_K_S, IQ4_XS) as well as Q8_0, F16 and BF16.
    base_name=$(printf '%s\n' "$model_name" \
        | sed -E 's/-([Ii]?[Qq][0-9]+[A-Za-z]*(_[A-Za-z0-9]+)*|[Bb]?[Ff][0-9]+)\.gguf$//')
    base_name="${base_name%.gguf}"
    [[ -z "$base_name" ]] && return 1

    for org in "unsloth" "bartowski" "lmstudio-community"; do
        test_url="https://huggingface.co/${org}/${base_name}-GGUF/resolve/main/${model_name}"
        # Read the status code itself instead of grepping the header text:
        # digits such as "200" also occur in other headers, and
        # `curl | grep -q` can fail under pipefail after a successful match.
        status=$(curl -sI -o /dev/null -w '%{http_code}' --max-time 10 "$test_url") || status=""
        case "$status" in
            200|301|302)
                echo "$test_url"
                return 0
                ;;
        esac
    done
    return 1
}
# Generate and start a model swap watcher script
#
# Args:    $1 = ds_dir, $2 = filename of the model being downloaded
# Effects: writes ${ds_dir}/scripts/model-swap-on-complete.sh, starts it in
#          the background with output appended to ${ds_dir}/logs/model-swap.log
#          and records its PID under the name "model-swap-watcher".
#
# The two install-specific values (pidfile path, target model) are written
# with printf %q instead of being substituted with sed, so names containing
# sed or shell metacharacters cannot corrupt the generated script.
create_model_swap_watcher() {
    local ds_dir="$1" model_name="$2"
    local watcher_script="${ds_dir}/scripts/model-swap-on-complete.sh"
    local pidfile_dir="${PIDFILE_DIR:-/var/run/dreamserver-p2p-gpu}"

    # The log directory must exist before the background redirect below;
    # otherwise the watcher never starts although a PID is reported.
    mkdir -p "${ds_dir}/scripts" "${ds_dir}/logs"

    {
        printf '%s\n' '#!/usr/bin/env bash' 'set -euo pipefail'
        printf '%s\n' '# Auto-swap model when aria2c download completes' ''
        printf 'PIDFILE=%q\n' "${pidfile_dir}/aria2c-model.pid"
        printf 'TARGET_MODEL=%q\n' "$model_name"
        cat << 'WATCHER_EOF'
SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
MODEL_DIR="${SCRIPT_DIR}/data/models"
ENV_FILE="${SCRIPT_DIR}/.env"
warn() { echo -e "\033[1;33m[!]\033[0m $*" >&2; }

compose_cmd() {
    if docker compose version &>/dev/null; then
        echo "docker compose"
    elif command -v docker-compose &>/dev/null; then
        echo "docker-compose"
    else
        echo "docker restart"
    fi
}

is_download_running() {
    [[ ! -f "$PIDFILE" ]] && return 1
    local pid
    pid=$(cat "$PIDFILE" 2>/dev/null || echo "")  # stderr expected: pidfile can be unreadable/missing during shutdown race
    [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null  # stderr expected: "No such process" while download exits
}

swap_model() {
    local new_model="$1"
    local old_model
    old_model=$(grep '^GGUF_FILE=' "$ENV_FILE" | cut -d= -f2 | tr -d '"' || echo "")
    [[ "$new_model" == "$old_model" ]] && return 0

    # Convert GGUF filename -> Dream model id used by other services.
    # Example: Qwen3-30B-A3B-Q4_K_M.gguf -> qwen3-30b-a3b
    local new_llm_model
    new_llm_model=$(echo "$new_model" \
        | sed -E 's/\.(gguf|GGUF)$//' \
        | sed -E 's/-Q[0-9]+([._][A-Za-z0-9]+)*$//' \
        | tr '[:upper:]' '[:lower:]')

    # Validate new model file before swapping
    local model_path="${MODEL_DIR}/${new_model}"
    if [[ ! -f "$model_path" ]]; then
        warn "Model file not found: ${model_path} — skipping swap"
        return 1
    fi
    # aria2 keeps a "<file>.aria2" control file next to an unfinished download.
    if [[ -f "${model_path}.aria2" ]]; then
        warn "Download of ${new_model} is incomplete (${model_path}.aria2 present) — skipping swap"
        return 1
    fi
    local file_size
    file_size=$(stat -c%s "$model_path" 2>/dev/null || echo 0)  # stderr expected: file can disappear during concurrent cleanup
    if [[ "$file_size" -lt 100000000 ]]; then
        warn "Model file too small (${file_size} bytes) — skipping swap"
        return 1
    fi

    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Swapping: ${old_model} -> ${new_model} ($(( file_size / 1048576 )) MB)"
    # [FIX: tmpfile-race] Use sed -i to avoid world-readable temp file with secrets
    sed -i "s|^GGUF_FILE=.*|GGUF_FILE=${new_model}|" "$ENV_FILE"
    if grep -q '^LLM_MODEL=' "$ENV_FILE"; then
        sed -i "s|^LLM_MODEL=.*|LLM_MODEL=${new_llm_model}|" "$ENV_FILE"
    else
        echo "LLM_MODEL=${new_llm_model}" >> "$ENV_FILE"
    fi

    # Update model size for VRAM budget calculations
    local new_size_mb
    new_size_mb=$(stat -c%s "$model_path" 2>/dev/null || echo 0)  # stderr expected: file can disappear during cleanup
    new_size_mb=$(( new_size_mb / 1048576 ))
    sed -i "s|^LLM_MODEL_SIZE_MB=.*|LLM_MODEL_SIZE_MB=${new_size_mb}|" "$ENV_FILE"
    if ! grep -q '^LLM_MODEL_SIZE_MB=' "$ENV_FILE"; then
        echo "LLM_MODEL_SIZE_MB=${new_size_mb}" >> "$ENV_FILE"
    fi

    # Use compose recreate (re-reads .env) instead of docker restart (ignores .env changes)
    local cmd
    cmd=$(compose_cmd)
    if [[ "$cmd" == "docker compose" ]]; then
        # [NON-FATAL: service] Llama restart can be retried if compose fails.
        cd "$SCRIPT_DIR" && docker compose up -d llama-server || warn "compose recreate failed (non-fatal)"
    elif [[ "$cmd" == "docker-compose" ]]; then
        # [NON-FATAL: service] Llama restart can be retried if compose fails.
        cd "$SCRIPT_DIR" && docker-compose up -d llama-server || warn "compose recreate failed (non-fatal)"
    else
        # [NON-FATAL: service] Restart failure should not block the watcher.
        docker restart dream-llama-server || warn "llama-server restart failed (non-fatal)"
    fi
    # Restart dependent services so they pick up new model env / auto-detection.
    for cname in dream-dreamforge dream-openclaw dream-dashboard-api; do
        if docker ps --format '{{.Names}}' | grep -qx "$cname"; then
            # [NON-FATAL: service] Dependent restarts are best-effort.
            docker restart "$cname" || warn "${cname} restart failed (non-fatal)"
        fi
    done
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Swapped to ${new_model} — llama-server reloading"
}

while true; do
    if ! is_download_running; then
        if [[ -n "${TARGET_MODEL:-}" && -f "${MODEL_DIR}/${TARGET_MODEL}" ]]; then
            swap_model "$TARGET_MODEL"
        else
            # Fallback: largest .gguf present. ls errors are discarded so an
            # error message can never be mistaken for a filename.
            local_model=$(ls -S "${MODEL_DIR}"/*.gguf 2>/dev/null | head -1 | xargs -r basename || echo "")
            if [[ -n "${local_model:-}" ]]; then
                swap_model "$local_model"
            fi
        fi
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Watcher exiting — download complete"
        exit 0
    fi
    sleep 30
done
WATCHER_EOF
    } > "$watcher_script"

    chmod +x "$watcher_script"
    nohup "$watcher_script" >> "${ds_dir}/logs/model-swap.log" 2>&1 &
    local watcher_pid=$!
    _store_pid "model-swap-watcher" "$watcher_pid"
    log "Model swap watcher started (PID: ${watcher_pid})"
}
-f /etc/vast.ai && "${PROVIDER_NAME:-}" != "vastai" ]]; then + warn "Not a detected P2P GPU environment — skipping port rebinding" + return 0 + fi + log "Rebinding Docker ports from 127.0.0.1 → 0.0.0.0 for Vast.ai external access" + local count=0 + while IFS= read -r -d '' compose_file; do + if grep -q '"127\.0\.0\.1:' "$compose_file"; then + sed -i 's/"127\.0\.0\.1:/"0.0.0.0:/g' "$compose_file" + count=$((count + 1)) + fi + # [NON-FATAL: discovery] Missing compose files just skips port rebinding. + done < <(find "$ds_dir" -maxdepth 4 \ + \( -name "docker-compose*.yml" -o -name "compose*.yaml" -o -name "compose*.yml" \) \ + -print0 2>&1 || warn "find compose files failed (non-fatal)") + log "Rebound ports in ${count} compose file(s) to 0.0.0.0" +} + +# Deploy Caddy reverse proxy for single-port access +setup_reverse_proxy() { + local ds_dir="$1" + local proxy_port="${2:-8080}" + local env_file="${ds_dir}/.env" + + _install_caddy || return 1 + _generate_caddyfile "$ds_dir" "$proxy_port" "$env_file" + _start_caddy "$ds_dir" "$proxy_port" "$env_file" || return 1 + _wait_for_proxy_backend "$proxy_port" +} + +_install_caddy() { + command -v caddy &>/dev/null && return 0 + log "Installing Caddy reverse proxy..." 
+ if apt-get install -y -qq debian-keyring debian-archive-keyring apt-transport-https 2>>"$LOGFILE" \ + && curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' \ + | gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg 2>>"$LOGFILE" \ + && curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' \ + | tee /etc/apt/sources.list.d/caddy-stable.list > /dev/null \ + && apt-get update -qq 2>>"$LOGFILE" \ + && apt-get install -y -qq caddy 2>>"$LOGFILE"; then + log "Caddy installed successfully" + else + warn "Caddy install failed — falling back to SSH tunnel mode" + return 1 + fi +} + +_generate_caddyfile() { + local ds_dir="$1" proxy_port="$2" env_file="$3" + local caddy_dir="${ds_dir}/config/caddy" + mkdir -p "$caddy_dir" + + local webui_port + webui_port="$(env_get "$env_file" "WEBUI_PORT")" + webui_port="${webui_port:-3000}" + + # Dashboard is the main DreamServer panel — it owns the root route. + # Its nginx serves SPA assets and proxies /api/ → dashboard-api with auth. 
+ local dashboard_port + dashboard_port="$(env_get "$env_file" "DASHBOARD_PORT")" + dashboard_port="${dashboard_port:-3001}" + + cat > "${caddy_dir}/Caddyfile" << CADDYEOF +# DreamServer reverse proxy — auto-generated by vastai setup +{ + auto_https off + admin off +} + +:${proxy_port} { + # Dashboard — main DreamServer panel (root route) + handle / { + reverse_proxy 127.0.0.1:${dashboard_port} + } + handle /api/* { + reverse_proxy 127.0.0.1:${dashboard_port} + } + handle /assets/* { + reverse_proxy 127.0.0.1:${dashboard_port} + } + + # Open WebUI — full access via SSH tunnel (port ${webui_port}) + handle_path /chat/* { + reverse_proxy 127.0.0.1:${webui_port} + } + + handle_path /health { + root * ${caddy_dir} + file_server + try_files /health.html + } + +CADDYEOF + + # Append auto-discovered service routes (skip root-handled services) + while IFS='|' read -r sid port_env port_def _name _cat proxy_mode _startup _cname; do + [[ -z "$port_env" || "$sid" == "open-webui" || "$sid" == "dashboard" ]] && continue + local svc_port + svc_port="$(env_get "$env_file" "$port_env")" + svc_port="${svc_port:-$port_def}" + [[ -z "$svc_port" ]] && continue + + if [[ "$proxy_mode" == "root" ]]; then + printf ' handle /%s/* {\n reverse_proxy 127.0.0.1:%s\n }\n\n' \ + "$sid" "$svc_port" >> "${caddy_dir}/Caddyfile" + else + printf ' handle_path /%s/* {\n reverse_proxy 127.0.0.1:%s\n }\n\n' \ + "$sid" "$svc_port" >> "${caddy_dir}/Caddyfile" + fi + done < <(discover_all_services "$ds_dir") + + # Ollama (base service, no manifest) + local ollama_port + ollama_port="$(env_get "$env_file" "OLLAMA_PORT")" + ollama_port="${ollama_port:-8080}" + cat >> "${caddy_dir}/Caddyfile" << CADDYTAIL + handle_path /ollama/* { + reverse_proxy 127.0.0.1:${ollama_port} + } + handle_path /v1/* { + reverse_proxy 127.0.0.1:${ollama_port} + } +} +CADDYTAIL + + generate_health_page "${caddy_dir}/health.html" "$ds_dir" +} + +_start_caddy() { + local ds_dir="$1" proxy_port="$2" env_file="$3" + local 
caddy_dir="${ds_dir}/config/caddy" + + if pgrep -x caddy > /dev/null 2>&1; then + local old_pid + old_pid=$(pgrep -x caddy | head -1) + # [NON-FATAL: cleanup] Old proxy process may have already exited. + kill "$old_pid" || warn "could not kill old caddy PID ${old_pid} (non-fatal)" + sleep 1 + fi + + mkdir -p "${ds_dir}/logs" + nohup caddy run --config "${caddy_dir}/Caddyfile" --adapter caddyfile \ + >> "${ds_dir}/logs/caddy-proxy.log" 2>&1 & + local caddy_pid=$! + sleep 2 + + if kill -0 "$caddy_pid" 2>&1; then + log "Caddy reverse proxy running on port ${proxy_port} (PID: ${caddy_pid})" + env_set "${env_file}" "REVERSE_PROXY_PORT" "$proxy_port" + return 0 + else + warn "Caddy failed to start — check ${ds_dir}/logs/caddy-proxy.log" + return 1 + fi +} + +_wait_for_proxy_backend() { + local proxy_port="$1" + local elapsed=0 code="000" + + while [[ "$elapsed" -lt 30 ]]; do + code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "http://127.0.0.1:${proxy_port}/" || echo "000") + if [[ "$code" =~ ^[23] ]]; then + return 0 + fi + sleep 3 + elapsed=$((elapsed + 3)) + done + + warn "Caddy is running on ${proxy_port}, but dashboard backend is not reachable yet (HTTP ${code})" + return 1 +} + +# Generate health dashboard HTML page +generate_health_page() { + local output_file="$1" + cat > "$output_file" << 'HEALTHEOF' + + + + + + DreamServer — Health + + + +

DreamServer Health

+

Loading...

+

Auto-refreshes every 15s

+ + + +HEALTHEOF + log "Generated health dashboard at ${output_file}" +} + +# Start Cloudflare Tunnel if token is configured +setup_cloudflare_tunnel() { + local ds_dir="$1" + local env_file="${ds_dir}/.env" + local cf_token + cf_token="$(env_get "$env_file" "CLOUDFLARE_TUNNEL_TOKEN")" + [[ -z "$cf_token" ]] && return 0 + + log "Cloudflare Tunnel token detected — setting up tunnel" + if ! command -v cloudflared &>/dev/null; then + local cf_tmp="/tmp/cloudflared-$$" + local cf_checksum_url="https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.sha256" + curl -sL https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 \ + -o "$cf_tmp" || { warn "cloudflared download failed (non-fatal)"; rm -f "$cf_tmp"; return 0; } + # Verify checksum when available + local expected_sha + expected_sha=$(curl -sL --max-time 10 "$cf_checksum_url" 2>>"$LOGFILE" | awk '{print $1}' || echo "") + if [[ -n "$expected_sha" ]]; then + local actual_sha + actual_sha=$(sha256sum "$cf_tmp" | awk '{print $1}') + if [[ "$actual_sha" != "$expected_sha" ]]; then + err "cloudflared checksum mismatch (expected: ${expected_sha:0:12}…, got: ${actual_sha:0:12}…)" + rm -f "$cf_tmp" + warn "Skipping Cloudflare tunnel — binary integrity check failed" + return 0 + fi + log "cloudflared checksum verified" + else + warn "cloudflared checksum not available — skipping integrity check" + fi + mv "$cf_tmp" /usr/local/bin/cloudflared + chmod +x /usr/local/bin/cloudflared + fi + + local proxy_port + proxy_port="$(env_get "$env_file" "REVERSE_PROXY_PORT")" + proxy_port="${proxy_port:-3000}" + + mkdir -p "${ds_dir}/logs" + # [FIX: cf-token] Pass token via env var, not CLI arg (hidden from ps aux) + TUNNEL_TOKEN="$cf_token" nohup cloudflared tunnel --no-autoupdate run --token-from-env TUNNEL_TOKEN \ + >> "${ds_dir}/logs/cloudflared.log" 2>&1 & + local cf_pid=$! + # [NON-FATAL: pidfile] Missing pidfile only affects teardown cleanup. 
+ _store_pid "cloudflared" "$cf_pid" 2>>"$LOGFILE" || warn "could not persist cloudflared pid (non-fatal)" + log "Cloudflare Tunnel started (PID: ${cf_pid}) — HTTPS access active" +} + +# Returns 0 when the supplied IP is RFC1918/private or otherwise unusable as a public endpoint. +_is_private_ip() { + local ip="$1" + + [[ -z "$ip" ]] && return 0 + + case "$ip" in + 10.*|172.1[6-9].*|172.2[0-9].*|172.3[0-1].*|192.168.*|169.254.*|127.*) return 0 ;; + *) return 1 ;; + esac +} + +# Get Vast.ai SSH connection info with proper env var handling +_get_vastai_ssh_info() { + local host_ip="" ssh_port="" + + # Priority 1: Vast.ai publishes the authoritative public IP here. + host_ip="${PUBLIC_IPADDR:-}" + ssh_port="${VAST_TCP_PORT_22:-}" + + # Priority 2: /proc/self/environ (handles SSH sessions that strip env vars) + if [[ -z "$host_ip" || -z "$ssh_port" ]]; then + if [[ -r /proc/self/environ ]]; then + if [[ -z "$host_ip" ]]; then + host_ip="$(tr '\0' '\n' < /proc/self/environ | grep '^PUBLIC_IPADDR=' | cut -d= -f2 || echo "")" + fi + if [[ -z "$ssh_port" ]]; then + ssh_port="$(tr '\0' '\n' < /proc/self/environ | grep '^VAST_TCP_PORT_22=' | cut -d= -f2 || echo "")" + fi + fi + fi + + # Priority 3: /etc/environment (Vast.ai onstart may export vars here) + if [[ -z "$host_ip" && -f /etc/environment ]]; then + host_ip="$(grep '^PUBLIC_IPADDR=' /etc/environment 2>/dev/null | cut -d= -f2 | tr -d '"' || echo "")" # stderr expected: file may be absent or unreadable + fi + if [[ -z "$ssh_port" && -f /etc/environment ]]; then + ssh_port="$(grep '^VAST_TCP_PORT_22=' /etc/environment 2>/dev/null | cut -d= -f2 | tr -d '"' || echo "")" # stderr expected: file may be absent or unreadable + fi + + # Discard any detected private/NAT address. 
+ if _is_private_ip "$host_ip"; then + host_ip="" + fi + + # Priority 4: External IP detection (reliable fallback) + if [[ -z "$host_ip" ]]; then + host_ip="$(curl -sf --max-time 5 ifconfig.me 2>>"$LOGFILE" || curl -sf --max-time 5 icanhazip.com 2>>"$LOGFILE" || echo '')" + fi + if [[ -z "$ssh_port" ]]; then + ssh_port="22" + fi + + echo "${host_ip}|${ssh_port}" +} + +# Generate auto-reconnecting SSH tunnel script +generate_ssh_tunnel_script() { + local ds_dir="$1" + local host_ip ssh_port + IFS='|' read -r host_ip ssh_port <<< "$(_get_vastai_ssh_info)" + + local env_file="${ds_dir}/.env" + local entry_port + entry_port="$(env_get "$env_file" "DASHBOARD_PORT")" + entry_port="${entry_port:-3001}" + local local_proxy_port="58080" + + local script_path="${ds_dir}/connect-tunnel.sh" + { + echo '#!/usr/bin/env bash' + echo '# DreamServer — auto-reconnecting SSH tunnel (run on YOUR LOCAL machine)' + echo "HOST=\"${host_ip}\"" + echo "SSH_PORT=\"${ssh_port}\"" + echo "ENTRY_PORT=\"${entry_port}\"" + echo '_uname="$(uname -s | tr "[:upper:]" "[:lower:]")"' + echo 'case "${_uname}" in' + echo " mingw*|msys*|cygwin*) _default_local_proxy=${local_proxy_port} ;;" + echo " *) _default_local_proxy=${local_proxy_port} ;;" + echo 'esac' + echo 'LOCAL_PROXY_PORT="${LOCAL_PROXY_PORT:-${_default_local_proxy}}"' + echo 'if [[ "${FULL_TUNNEL:-0}" == "1" ]]; then' + echo ' FORWARDS="-L ${LOCAL_PROXY_PORT}:127.0.0.1:${ENTRY_PORT}' + discover_service_ports "$ds_dir" | while IFS='|' read -r key port _label; do + [[ "$key" == "REVERSE_PROXY_PORT" ]] && continue + local_port="$port" + [[ "$port" -lt 1024 ]] && local_port=$((10000 + port)) + echo " -L ${local_port}:127.0.0.1:${port}" + done + echo ' "' + echo 'else' + echo ' FORWARDS="-L ${LOCAL_PROXY_PORT}:127.0.0.1:${ENTRY_PORT}"' + echo 'fi' + echo 'DELAY=5' + echo 'while true; do' + echo ' ssh -N -o ServerAliveInterval=15 -o ServerAliveCountMax=3 \' + echo ' -o ExitOnForwardFailure=yes \' + echo ' -p "$SSH_PORT" $FORWARDS root@"$HOST"' + 
echo ' echo "[!] Connection lost. Reconnecting in ${DELAY}s..."' + echo ' sleep "$DELAY"' + echo ' DELAY=$(( DELAY < 60 ? DELAY * 2 : 60 ))' + echo 'done' + } > "$script_path" + chmod +x "$script_path" + log "Generated auto-reconnecting tunnel script: ${script_path}" +} + +generate_powershell_tunnel_script() { + local ds_dir="$1" + local host_ip ssh_port + IFS='|' read -r host_ip ssh_port <<< "$(_get_vastai_ssh_info)" + + local env_file="${ds_dir}/.env" + local entry_port + entry_port="$(env_get "$env_file" "DASHBOARD_PORT")" + entry_port="${entry_port:-3001}" + local local_proxy_port="58080" + + local script_path="${ds_dir}/connect-tunnel.ps1" + { + cat << POWERSHELL_HEAD +param( + [int]\$LocalProxyPort = ${local_proxy_port}, + [int]\$ReconnectDelay = 5, + [string]\$Host = "${host_ip}", + [int]\$SshPort = ${ssh_port} +) + +\$EntryPort = ${entry_port} +while (\$true) { + \$Forwards = @( + "-L"; "\${LocalProxyPort}:127.0.0.1:\$EntryPort"; +POWERSHELL_HEAD + discover_service_ports "$ds_dir" | while IFS='|' read -r key port _label; do + [[ "$key" == "REVERSE_PROXY_PORT" ]] && continue + lp="$port" + [[ "$port" -lt 1024 ]] && lp=$((10000 + port)) + printf ' "-L"; "%s:127.0.0.1:%s";\n' "$lp" "$port" + done + cat << 'POWERSHELL_TAIL' + ) + ssh -N -o ServerAliveInterval=15 -o ServerAliveCountMax=3 -o ExitOnForwardFailure=yes -p $SshPort @Forwards "root@$Host" + Write-Host "[!] Connection lost. Reconnecting in ${ReconnectDelay}s..." 
+ Start-Sleep -Seconds $ReconnectDelay + if ($ReconnectDelay -lt 60) { + $ReconnectDelay = [Math]::Min($ReconnectDelay * 2, 60) + } +} +POWERSHELL_TAIL + } > "$script_path" + log "Generated PowerShell tunnel script: ${script_path}" +} + +# ── Print access info (split into sub-functions) ─────────────────────────── +print_access_info() { + local ds_dir="$1" + local env_file="${ds_dir}/.env" + local host_ip ssh_port + local dash_api_status dashboard_status webui_status + IFS='|' read -r host_ip ssh_port <<< "$(_get_vastai_ssh_info)" + dash_api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing") # stderr expected: container may not exist + dashboard_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing") # stderr expected: container may not exist + webui_status=$( + docker inspect --format '{{.State.Status}}' dream-webui 2>/dev/null || # stderr expected: container may not exist + docker inspect --format '{{.State.Status}}' dream-open-webui 2>/dev/null || # stderr expected: container may not exist + echo "missing" + ) + + echo "" + if [[ "$dash_api_status" == "running" && ( "$dashboard_status" == "running" || "$webui_status" == "running" ) ]]; then + echo -e "${CYAN}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo -e "${CYAN}${BOLD} DreamServer is ready on Vast.ai!${NC}" + echo -e "${CYAN}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + else + echo -e "${YELLOW}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo -e "${YELLOW}${BOLD} DreamServer access info (core services still starting)${NC}" + echo -e "${YELLOW}${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + fi + echo "" + echo -e "${BOLD}Working directory:${NC} ${ds_dir}" + echo -e "${BOLD}Setup log:${NC} ${LOGFILE}" + echo "" + + _print_ssh_section "$ds_dir" "$env_file" "$host_ip" "$ssh_port" + _print_service_list "$ds_dir" + 
_print_model_upload_help "$ds_dir" "$host_ip" "$ssh_port" + _print_commands_help "$ds_dir" +} + +_print_proxy_section() { + local ds_dir="$1" env_file="$2" host_ip="$3" + local proxy_port root_code proxy_ready="false" + proxy_port="$(env_get "$env_file" "REVERSE_PROXY_PORT")" + if [[ -n "$proxy_port" ]] && pgrep -x caddy > /dev/null 2>&1; then + root_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "http://127.0.0.1:${proxy_port}/" || echo "000") + [[ "$root_code" =~ ^[23] ]] && proxy_ready="true" + + # On Vast.ai, the container's internal port is remapped to a random + # external port exposed in VAST_TCP_PORT_. If that var exists, + # the public URL must use the mapped port — not the internal one. + local ext_port_var="VAST_TCP_PORT_${proxy_port}" + local ext_port="${!ext_port_var:-}" + if [[ "$proxy_ready" == "true" ]]; then + echo -e "${GREEN}${BOLD}▸ Reverse Proxy Active (single-port access!)${NC}" + else + echo -e "${YELLOW}${BOLD}▸ Reverse proxy process is running (backends still starting)${NC}" + fi + echo "" + if [[ -n "$ext_port" ]]; then + echo -e " ${BOLD}Public (Vast.ai mapped):${NC}" + echo -e " Dashboard: ${BOLD}http://${host_ip}:${ext_port}/${NC}" + echo " Open WebUI: http://${host_ip}:${ext_port}/chat/" + echo " n8n Workflows: http://${host_ip}:${ext_port}/n8n/" + echo " Health Status: http://${host_ip}:${ext_port}/health" + echo "" + echo -e " ${DIM}(Internal port ${proxy_port} is remapped by Vast.ai to ${ext_port}.)${NC}" + else + echo -e " Dashboard: ${BOLD}http://${host_ip}:${proxy_port}/${NC}" + echo " Open WebUI: http://${host_ip}:${proxy_port}/chat/" + echo " n8n Workflows: http://${host_ip}:${proxy_port}/n8n/" + echo " Health Status: http://${host_ip}:${proxy_port}/health" + echo "" + echo -e " ${YELLOW}Note:${NC} no VAST_TCP_PORT_${proxy_port} env var was found." + echo -e " If the URL above is unreachable, the Vast.ai instance didn't expose" + echo -e " port ${proxy_port}. 
Either:" + echo -e " • Edit the instance → add ${proxy_port} to 'On-start script' port list, or" + echo -e " • Use the SSH tunnel below (always works)." + fi + + if [[ "$proxy_ready" != "true" ]]; then + echo "" + echo -e " ${YELLOW}Warning:${NC} proxy is listening, but dashboard/open-webui backends are not healthy yet." + echo -e " Run: ${BOLD}bash ${SCRIPT_NAME} --fix${NC}" + fi + + echo "" + fi +} + +_print_ssh_section() { + local ds_dir="$1" env_file="$2" host_ip="$3" ssh_port="$4" + echo -e "${BOLD}━━━ SSH Tunnel (recommended for Vast.ai) ━━━${NC}" + echo "" + + local tunnel_flags="" + local remapped_notes="" + local entry_port windows_local_proxy_port + entry_port="$(env_get "$env_file" "DASHBOARD_PORT")" + entry_port="${entry_port:-3001}" + windows_local_proxy_port="58080" + + while IFS='|' read -r key port _label; do + [[ "$key" == "REVERSE_PROXY_PORT" ]] && continue + local local_port="$port" + if [[ "$port" -lt 1024 ]]; then + local_port=$((10000 + port)) + remapped_notes="${remapped_notes}\n ${DIM} Port ${port} remapped to local ${local_port} (ports <1024 need admin)${NC}" + fi + tunnel_flags="${tunnel_flags} -L ${local_port}:127.0.0.1:${port}" + done < <(discover_service_ports "$ds_dir") + + echo -e " ${BOLD}Windows PowerShell (all ports, recommended):${NC}" + echo -e "${DIM}ssh -N -o ExitOnForwardFailure=yes -p ${ssh_port} -i \$env:USERPROFILE\\.ssh\\id_ed25519 -L ${windows_local_proxy_port}:127.0.0.1:${entry_port}${tunnel_flags} root@${host_ip}${NC}" + echo -e "${DIM}Open dashboard: http://127.0.0.1:3001/${NC}" + echo -e "${DIM}Easy alias: http://127.0.0.1:${windows_local_proxy_port}/${NC}" + echo "" + + echo -e " ${BOLD}Linux / macOS (all ports):${NC}" + echo -e "${DIM}ssh -N -p ${ssh_port} -i ~/.ssh/id_ed25519 -L ${windows_local_proxy_port}:127.0.0.1:${entry_port}${tunnel_flags} root@${host_ip}${NC}" + echo -e "${DIM}Open dashboard: http://127.0.0.1:3001/${NC}" + echo "" + + echo -e " ${BOLD}Auto-reconnect scripts:${NC}" + echo -e " ${DIM}Windows: 
scp -P ${ssh_port} root@${host_ip}:${ds_dir}/connect-tunnel.ps1 .${NC}" + echo -e " ${DIM} powershell -ExecutionPolicy Bypass -File .\\connect-tunnel.ps1${NC}" + echo -e " ${DIM}Linux/macOS/WSL: scp -P ${ssh_port} root@${host_ip}:${ds_dir}/connect-tunnel.sh .${NC}" + echo -e " ${DIM} FULL_TUNNEL=1 bash connect-tunnel.sh${NC}" + echo "" + echo -e " ${DIM}If Windows reports \"bind [127.0.0.1]:PORT: Permission denied\",${NC}" + echo -e " ${DIM}that local port is reserved by Hyper-V/WinNAT. Use a different local port:${NC}" + echo -e " ${DIM} -L 58080:127.0.0.1:${entry_port} (or any free high port)${NC}" + echo -e " ${DIM}Optional admin fix:${NC}" + echo -e " ${DIM} net stop winnat; net start winnat (run PowerShell as admin)${NC}" + echo -e " ${DIM}Check excluded ranges: netsh int ipv4 show excludedportrange protocol=tcp${NC}" + echo -e " ${DIM}If you see \"channel N: open failed: connect failed: Connection refused\",${NC}" + echo -e " ${DIM}the SSH tunnel is up, but that specific remote service is not listening yet.${NC}" + echo "" + + if [[ -n "$remapped_notes" ]]; then + echo -e "${BOLD} Remapped privileged ports:${NC}" + echo -e "$remapped_notes" + echo "" + fi + +} + +_print_service_list() { + local ds_dir="$1" + echo -e "${BOLD}Services:${NC}" + discover_service_ports "$ds_dir" | while IFS='|' read -r key port label; do + [[ "$key" == "REVERSE_PROXY_PORT" ]] && continue + printf " %-22s http://localhost:%s\n" "${label}:" "${port}" + done + echo "" +} + +_print_model_upload_help() { + local ds_dir="$1" host_ip="$2" ssh_port="$3" + echo -e "${BOLD}Upload Custom Models:${NC}" + echo " scp -P ${ssh_port} my-model.gguf root@${host_ip}:${ds_dir}/data/models/" + echo " # Then: edit .env GGUF_FILE=my-model.gguf && docker restart dream-llama-server" + echo "" +} + +_print_commands_help() { + local ds_dir="$1" + local script_name="${SCRIPT_NAME:-setup.sh}" + echo -e "${BOLD}Commands:${NC}" + echo " bash ${script_name} --status # Check health" + echo " bash ${script_name} 
--info # Connection details" + echo " bash ${script_name} --fix # Apply fixes + restart" + echo " bash ${script_name} --resume # Quick restart after SSH drop" + echo " bash ${script_name} --teardown # Stop all services" + echo "" +} diff --git a/dream-server/installers/p2p-gpu/lib/permissions.sh b/dream-server/installers/p2p-gpu/lib/permissions.sh new file mode 100644 index 000000000..b665104f3 --- /dev/null +++ b/dream-server/installers/p2p-gpu/lib/permissions.sh @@ -0,0 +1,431 @@ +#!/usr/bin/env bash +# ============================================================================ +# DreamServer — P2P GPU Permission System +# ============================================================================ +# Part of: dream-server/installers/p2p-gpu/lib/ +# Purpose: POSIX ACLs, setgid, UID-specific ownership, data dir scaffolding +# +# Expects: DREAM_USER, DREAM_HOME, LOGFILE, log(), warn(), err() +# Provides: ensure_acl_tools(), apply_data_acl(), apply_multi_uid_perms(), +# fix_known_uid_requirements(), precreate_extension_data_dirs(), +# configure_dream_umask(), create_permission_fix_script() +# +# Modder notes: +# Three-layer permission system: +# 1. POSIX ACLs with default entries on data/ +# 2. Setgid bit (2775) on directories +# 3. Known UID overrides for services that check ownership at startup +# +# [FIX: shared-acl] Permission strategy: +# - Primary: setgid (2775) + POSIX ACLs → group-based access +# - Shared dirs get explicit per-UID ACLs for the writers we know about +# - setfacl is required; fail fast when unavailable +# +# Error handling — two tiers: +# HARD-FAIL (exit 1): setfacl application, acl package install, +# primary chown/chmod on data dirs — if these fail the stack +# cannot start safely. +# WARN-AND-CONTINUE (|| warn): service-specific chown for individual +# extensions (qdrant, whisper, dashboard-api) — one service failing +# ownership should not prevent the other 16 from starting. 
#   Also used for UID extraction (parse helper) and generated repair scripts
#   (which should fix as much as possible per run).
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# Install ACL tools if missing.
#
# Globals:  LOGFILE (read) — stderr of apt-get is appended to it
# Returns:  0 when setfacl is on PATH (already present, or just installed)
# Exits:    1 when the acl package cannot be installed, or when the install
#           reports success but setfacl is still not on PATH
ensure_acl_tools() {
    if command -v setfacl &>/dev/null; then
        return 0
    fi

    if ! apt-get install -y -qq acl 2>>"$LOGFILE"; then
        err "Failed to install acl package — setfacl is required. Install manually: apt-get install acl"
        exit 1
    fi

    # Do not trust the package manager's exit status alone: every later step
    # hard-requires setfacl, so confirm the binary is really available now
    # instead of failing later in the middle of a permission pass.
    if ! command -v setfacl &>/dev/null; then
        err "acl package installed but setfacl is still not on PATH — install manually: apt-get install acl"
        exit 1
    fi
}

# Apply POSIX ACLs + setgid so every container UID can coexist.
# This is the PRIMARY permission mechanism — covers most services.
#
# Globals:   DREAM_USER (read) — used as both owner and group for chown
# Arguments: $1 - data directory to fix; a missing directory is a no-op
# Returns:   0 on success, or when $1 is not a directory
# Exits:     1 when setfacl is unavailable or any chown/chmod/setfacl step
#            fails (HARD-FAIL tier: the stack cannot start safely)
apply_data_acl() {
    local dir="$1"
    [[ -d "$dir" ]] || return 0

    # Fail fast: verify the required tool BEFORE mutating anything, so a host
    # without setfacl is not left with ownership/modes rewritten but no ACLs.
    if ! command -v setfacl &>/dev/null; then
        err "setfacl unavailable — install with: apt-get install acl"
        exit 1
    fi

    # Layer 1: base ownership for the whole tree.
    if ! chown -R "${DREAM_USER}:${DREAM_USER}" "$dir"; then
        err "chown failed on ${dir} — cannot set base ownership for data directory"
        exit 1
    fi
    # Layer 2: setgid on directories so new entries inherit the group.
    if ! find "$dir" -type d -exec chmod 2775 {} +; then
        err "chmod dirs failed on ${dir} — cannot set setgid on data directories"
        exit 1
    fi
    if ! find "$dir" -type f -exec chmod 0664 {} +; then
        err "chmod files failed on ${dir} — cannot set group-writable on data files"
        exit 1
    fi

    # Layer 3: ACLs. Default (-d) entries cover files created later; the
    # second call covers what already exists.
    # dashboard-api runs as uid 1000 (dreamer) and needs write access to /data
    # for .extensions-lock and token_counter.json.
    if ! setfacl -R -d -m "u::rwx,u:1000:rwx,g::rwx,o::rx" "$dir"; then
        err "Failed to apply default ACLs on ${dir} — mount may be ACL-incompatible"
        exit 1
    fi
    if ! setfacl -R -m "u:1000:rwx,g::rwx" "$dir"; then
        err "Failed to apply current ACLs on ${dir} — mount may be ACL-incompatible"
        exit 1
    fi
    log "Applied POSIX ACLs on ${dir}"
}

# [FIX: shared-acl] Apply explicit ACLs to directories with multiple writers.
# The caller must name the additional UIDs that need write access.
+apply_multi_uid_perms() { + local dir="$1" reason="$2" + shift 2 + [[ ! -d "$dir" ]] && return 0 + + if ! chown -R "${DREAM_USER}:${DREAM_USER}" "$dir"; then + err "chown failed on ${dir} — cannot set base ownership for shared directory" + exit 1 + fi + if ! find "$dir" -type d -exec chmod 2775 {} +; then + err "chmod dirs failed on ${dir} — cannot set setgid on shared directories" + exit 1 + fi + if ! find "$dir" -type f -exec chmod 0664 {} +; then + err "chmod files failed on ${dir} — cannot set group-writable on shared files" + exit 1 + fi + + if ! command -v setfacl &>/dev/null; then + err "setfacl unavailable — install with: apt-get install acl" + exit 1 + fi + + local acl_suffix="" + if [[ $# -gt 0 ]]; then + acl_suffix=",$*" + fi + + if ! setfacl -R -d -m "u::rwx,g::rwx,o::rx${acl_suffix}" "$dir"; then + err "Failed to apply shared default ACLs on ${dir} — mount may be ACL-incompatible" + exit 1 + fi + if ! setfacl -R -m "u::rwx,g::rwx${acl_suffix}" "$dir"; then + err "Failed to apply shared current ACLs on ${dir} — mount may be ACL-incompatible" + exit 1 + fi + log "Applied shared ACLs on ${dir} (reason: ${reason})" +} + +# Extract numeric UID from a compose.yaml user: directive +_extract_compose_uid() { + local compose_file="$1" + [[ ! -f "$compose_file" ]] && return 0 + # [NON-FATAL: discovery] One bad compose file should not block others. 
+ python3 -c " +import yaml, re, sys +try: + data = yaml.safe_load(open(sys.argv[1])) + services = data.get('services') or {} + for sdef in services.values(): + user = str(sdef.get('user', '')) + if not user: continue + resolved = re.sub(r'\\\$\{[A-Za-z_]+:-(\d+)\}', r'\1', user) + uid = resolved.split(':')[0].strip() + if uid.isdigit(): + print(uid) + break +except yaml.YAMLError as e: + print(f'YAML parse error in {sys.argv[1]}: {e}', file=sys.stderr) +except OSError as e: + print(f'File read error {sys.argv[1]}: {e}', file=sys.stderr) +" "$compose_file" || warn "UID extraction failed for ${compose_file} (non-fatal)" +} + +# Fix UID-specific ownership that ACLs alone don't solve +fix_known_uid_requirements() { + local data_dir="$1" + local gpu_backend="${2:-nvidia}" + local ds_dir + ds_dir=$(dirname "$data_dir") + + _fix_dynamic_uids "$ds_dir" "$data_dir" + _fix_uid_exceptions "$data_dir" "$gpu_backend" + + log "Fixed UID-specific ownership for services (dynamic + exceptions)" +} + +_fix_dynamic_uids() { + local ds_dir="$1" data_dir="$2" + local ext_dirs=("${ds_dir}/extensions/services" "${ds_dir}/user-extensions") + local dream_uid + dream_uid=$(id -u "$DREAM_USER" 2>>"$LOGFILE" || echo "") + for ext_root in "${ext_dirs[@]}"; do + [[ ! -d "$ext_root" ]] && continue + for ext_path in "${ext_root}"/*/; do + [[ ! -d "$ext_path" ]] && continue + local ext_name + ext_name=$(basename "$ext_path") + local ext_data="${data_dir}/${ext_name}" + local compose_file="" + for candidate in "${ext_path}compose.yaml" "${ext_path}compose.yml"; do + [[ -f "$candidate" ]] && compose_file="$candidate" && break + done + [[ -z "$compose_file" ]] && continue + local uid + uid=$(_extract_compose_uid "$compose_file") + if [[ -n "$uid" && "$uid" != "0" ]]; then + mkdir -p "$ext_data" + # best-effort: one extension failing ownership should not block others + if [[ -n "$dream_uid" && "$uid" == "$dream_uid" ]]; then + continue + fi + if ! 
chown -R "${uid}:${uid}" "$ext_data" 2>>"$LOGFILE"; then + warn "chown ${ext_name} to uid ${uid} failed (non-fatal) — attempting ACL fallback" + if command -v setfacl &>/dev/null; then + # [NON-FATAL: ${ext_name}] Individual service failure does not block others. + setfacl -R -m "u:${uid}:rwx" "$ext_data" 2>>"$LOGFILE" \ + || warn "setfacl ${ext_name} uid ${uid} failed (non-fatal)" + # [NON-FATAL: ${ext_name}] Individual service failure does not block others. + setfacl -R -d -m "u:${uid}:rwx" "$ext_data" 2>>"$LOGFILE" \ + || warn "setfacl default ${ext_name} uid ${uid} failed (non-fatal)" + fi + fi + fi + done + done +} + +_fix_uid_exceptions() { + local data_dir="$1" gpu_backend="$2" + + # qdrant: uid 1000, no user: in compose.yaml — explicit chown required + if [[ -d "${data_dir}/qdrant" ]]; then + # best-effort: qdrant-specific ownership — does not block other services + # [NON-FATAL: qdrant] Individual service failure does not block others. + chown -R 1000:1000 "${data_dir}/qdrant" || warn "qdrant ownership fix failed (non-fatal)" + fi + + # searxng: uid varies by image version (977 or 1000) — grant both known UIDs + if [[ -d "${data_dir}/searxng" ]]; then + apply_multi_uid_perms "${data_dir}/searxng" "uid varies by image version (977/1000)" "u:977:rwx,u:1000:rwx" + fi + + # comfyui: AMD vs NVIDIA layout + fix_comfyui_permissions "$data_dir" "$gpu_backend" + + # open-webui: grant both root (container) and uid 1000 (dream/dashboard-api) + if [[ -d "${data_dir}/open-webui" ]]; then + if ! setfacl -R -d -m "u::rwx,u:0:rwx,u:1000:rwx,g::rwx,o::rx" "${data_dir}/open-webui"; then + err "Failed to apply default ACLs on ${data_dir}/open-webui — mount may be ACL-incompatible" + exit 1 + fi + if ! 
setfacl -R -m "u:0:rwx,u:1000:rwx,g::rwx" "${data_dir}/open-webui"; then + err "Failed to apply ACLs on ${data_dir}/open-webui — mount may be ACL-incompatible" + exit 1 + fi + fi + + # whisper: grant known writers uid 1000 + root for cache/bootstrap flows + if [[ -d "${data_dir}/whisper" ]]; then + # best-effort: whisper ownership — ACLs above enforce access regardless + # [NON-FATAL: whisper] Individual service failure does not block others. + chown -R 1000:1000 "${data_dir}/whisper" || warn "whisper chown failed (non-fatal)" + if ! setfacl -R -d -m "u::rwx,u:0:rwx,u:1000:rwx,g::rwx,o::rx" "${data_dir}/whisper"; then + err "Failed to apply default ACLs on ${data_dir}/whisper — mount may be ACL-incompatible" + exit 1 + fi + if ! setfacl -R -m "u:0:rwx,u:1000:rwx,g::rwx" "${data_dir}/whisper"; then + err "Failed to apply ACLs on ${data_dir}/whisper — mount may be ACL-incompatible" + exit 1 + fi + fi + + # dashboard-api: uid 1000 (dreamer) — needs rw on data/ and .env + local ds_dir + ds_dir=$(dirname "$data_dir") + if [[ -d "${data_dir}/dashboard-api" ]]; then + # best-effort: dashboard-api ownership — service starts as uid 1000 regardless + # [NON-FATAL: dashboard-api] Individual service failure does not block others. + chown -R 1000:1000 "${data_dir}/dashboard-api" || warn "dashboard-api chown failed (non-fatal)" + fi + if command -v setfacl &>/dev/null && [[ -f "${ds_dir}/.env" ]]; then + if ! 
setfacl -m u:1000:rw "${ds_dir}/.env"; then
            err "Failed to apply ACL on ${ds_dir}/.env for dashboard-api"
            exit 1
        fi
    fi

    # models (shared): grant the non-root writer used by the p2p-gpu toolkit
    if [[ -d "${data_dir}/models" ]]; then
        apply_multi_uid_perms "${data_dir}/models" "multi-service write: llama-server, comfyui, aria2c" "u:1000:rwx"
    fi
}

# Pre-create data directories for all known extensions.
# Args:    $1 - DreamServer install directory
# Creates: data/<name> for every <name>/manifest.yaml found under
#          extensions/services and user-extensions, the ComfyUI bind-mount
#          tree, and the user-extensions directory itself.
precreate_extension_data_dirs() {
    local ds_dir="$1"
    local data_dir="${ds_dir}/data"
    local ext_dirs=("${ds_dir}/extensions/services" "${ds_dir}/user-extensions")
    local ext_root manifest ext_name

    for ext_root in "${ext_dirs[@]}"; do
        [[ -d "$ext_root" ]] || continue
        for manifest in "${ext_root}"/*/manifest.yaml; do
            # An unmatched glob stays literal; the -f test filters it out.
            [[ -f "$manifest" ]] || continue
            ext_name=$(basename "$(dirname "$manifest")")
            mkdir -p "${data_dir}/${ext_name}"
        done
    done

    # Pre-create ComfyUI bind-mount paths so Docker doesn't auto-create root-owned
    # 0755 directories that are unwritable for the non-root comfyui user.
    mkdir -p "${data_dir}/comfyui/models" \
        "${data_dir}/comfyui/models/checkpoints" \
        "${data_dir}/comfyui/output" \
        "${data_dir}/comfyui/input" \
        "${data_dir}/comfyui/workflows" \
        "${data_dir}/comfyui/ComfyUI/models" \
        "${data_dir}/comfyui/ComfyUI/output" \
        "${data_dir}/comfyui/ComfyUI/input" \
        "${data_dir}/comfyui/ComfyUI/custom_nodes"

    # [NON-FATAL: extensions] Optional user-extensions directory.
    mkdir -p "${ds_dir}/user-extensions" || warn "could not create user-extensions (non-fatal)"
    log "Pre-created data directories for all known extensions"
}

# Set dream user's umask for group-writable files.
# Appends "umask 0002" to DREAM_HOME's .bashrc and .profile when the file
# exists and does not already contain that line.
configure_dream_umask() {
    local f
    for f in "${DREAM_HOME}/.bashrc" "${DREAM_HOME}/.profile"; do
        if [[ -f "$f" ]] && ! grep -q 'umask 0002' "$f"; then
            printf '\n# DreamServer: group-writable files by default\numask 0002\n' >> "$f"
        fi
    done
}

# Generate standalone permission-fix script.
# Args:    $1 - DreamServer install directory
# Writes:  <ds_dir>/scripts/fix-permissions.sh (made executable)
# Notes:   For each extension whose compose file yields a non-root uid via
#          _extract_compose_uid, a chown step is emitted into the script.
#          Every "directory exists -> chown/chmod" step is emitted as an
#          if-block: the former `[[ -d X ]] && cmd || warn` form printed a
#          "failed" warning whenever X simply did not exist.
create_permission_fix_script() {
    local ds_dir="$1"
    local uid_fix_lines=""
    local ext_root ext_path ext_name candidate uid

    local ext_dirs=("${ds_dir}/extensions/services" "${ds_dir}/user-extensions")
    for ext_root in "${ext_dirs[@]}"; do
        [[ -d "$ext_root" ]] || continue
        for ext_path in "${ext_root}"/*/; do
            [[ -d "$ext_path" ]] || continue
            ext_name=$(basename "$ext_path")
            for candidate in "${ext_path}compose.yaml" "${ext_path}compose.yml"; do
                [[ -f "$candidate" ]] || continue
                uid=$(_extract_compose_uid "$candidate")
                if [[ -n "$uid" && "$uid" != "0" ]]; then
                    # [NON-FATAL: fix-script] Generated fixer is best-effort by design.
                    uid_fix_lines+="if [[ -d \"\${DATA_DIR}/${ext_name}\" ]]; then"$'\n'
                    uid_fix_lines+="    chown -R ${uid}:${uid} \"\${DATA_DIR}/${ext_name}\" || warn \"${ext_name} chown failed (non-fatal)\""$'\n'
                    uid_fix_lines+="fi"$'\n'
                fi
                # Only the first compose file that exists is considered.
                break
            done
        done
    done

    mkdir -p "${ds_dir}/scripts"
    # Unquoted delimiter: ${uid_fix_lines} expands now, every \$ stays literal
    # in the generated script.
    cat > "${ds_dir}/scripts/fix-permissions.sh" << PERMFIX_EOF
#!/usr/bin/env bash
set -euo pipefail
# DreamServer permission fixer — auto-generated, safe to run anytime.
SCRIPT_DIR="\$(cd "\$(dirname "\$0")/.." && pwd)"
DATA_DIR="\${SCRIPT_DIR}/data"
warn() { echo -e "\033[1;33m[!]\033[0m \$*" >&2; }

echo "[*] Fixing permissions on \${DATA_DIR}..."

if ! command -v setfacl &>/dev/null; then
    echo "[x] setfacl unavailable — install with: apt-get install acl" >&2
    exit 1
fi

find "\$DATA_DIR" -type d -exec chmod 2775 {} + || warn "chmod dirs failed (non-fatal)"
find "\$DATA_DIR" -type f -exec chmod 0664 {} + || warn "chmod files failed (non-fatal)"
if ! setfacl -R -d -m "u::rwx,u:1000:rwx,g::rwx,o::rx" "\$DATA_DIR"; then
    echo "[x] Failed to apply default ACLs on \$DATA_DIR — mount may be ACL-incompatible" >&2
    exit 1
fi
if ! setfacl -R -m "u:1000:rwx,g::rwx" "\$DATA_DIR"; then
    echo "[x] Failed to apply current ACLs on \$DATA_DIR — mount may be ACL-incompatible" >&2
    exit 1
fi

${uid_fix_lines}
if [[ -d "\${DATA_DIR}/qdrant" ]]; then
    chown -R 1000:1000 "\${DATA_DIR}/qdrant" || warn "qdrant fix failed (non-fatal)"
fi
if [[ -d "\${DATA_DIR}/open-webui" ]]; then
    if ! setfacl -R -d -m "u::rwx,u:0:rwx,u:1000:rwx,g::rwx,o::rx" "\${DATA_DIR}/open-webui"; then
        echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
        exit 1
    fi
    if ! setfacl -R -m "u:0:rwx,u:1000:rwx,g::rwx" "\${DATA_DIR}/open-webui"; then
        echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
        exit 1
    fi
fi
if [[ -d "\${DATA_DIR}/whisper" ]]; then
    chown -R 1000:1000 "\${DATA_DIR}/whisper" || warn "whisper chown failed (non-fatal)"
    if ! setfacl -R -d -m "u::rwx,u:0:rwx,u:1000:rwx,g::rwx,o::rx" "\${DATA_DIR}/whisper"; then
        echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
        exit 1
    fi
    if ! setfacl -R -m "u:0:rwx,u:1000:rwx,g::rwx" "\${DATA_DIR}/whisper"; then
        echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
        exit 1
    fi
fi
# Multi-UID directories: searxng (uid varies), models (non-root writer)
if [[ -d "\${DATA_DIR}/searxng" ]]; then
    if ! setfacl -R -d -m "u::rwx,u:977:rwx,u:1000:rwx,g::rwx,o::rx" "\${DATA_DIR}/searxng"; then
        echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
        exit 1
    fi
    if ! setfacl -R -m "u:977:rwx,u:1000:rwx,g::rwx" "\${DATA_DIR}/searxng"; then
        echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
        exit 1
    fi
fi
if [[ -d "\${DATA_DIR}/models" ]]; then
    if ! setfacl -R -d -m "u::rwx,u:1000:rwx,g::rwx,o::rx" "\${DATA_DIR}/models"; then
        echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
        exit 1
    fi
    if ! setfacl -R -m "u:1000:rwx,g::rwx" "\${DATA_DIR}/models"; then
        echo "[x] Failed to apply ACLs — mount may be ACL-incompatible" >&2
        exit 1
    fi
fi

for d in \
    "\${DATA_DIR}/comfyui/models" \
    "\${DATA_DIR}/comfyui/models/checkpoints" \
    "\${DATA_DIR}/comfyui/output" \
    "\${DATA_DIR}/comfyui/input" \
    "\${DATA_DIR}/comfyui/workflows" \
    "\${DATA_DIR}/comfyui/ComfyUI/models" \
    "\${DATA_DIR}/comfyui/ComfyUI/output" \
    "\${DATA_DIR}/comfyui/ComfyUI/input" \
    "\${DATA_DIR}/comfyui/ComfyUI/custom_nodes"; do
    mkdir -p "\$d" || warn "comfyui mkdir failed on \$d (non-fatal)"
    if [[ -d "\$d" ]]; then
        chmod 2775 "\$d" || warn "comfyui dir mode fix failed on \$d (non-fatal)"
    fi
done

find "\${SCRIPT_DIR}/scripts" -name "*.sh" -exec chmod +x {} + || warn "scripts chmod failed (non-fatal)"
echo "[✓] Permissions fixed"
PERMFIX_EOF

    chmod +x "${ds_dir}/scripts/fix-permissions.sh"
    log "Created reusable permission fixer: ${ds_dir}/scripts/fix-permissions.sh"
}
diff --git a/dream-server/installers/p2p-gpu/lib/services.sh b/dream-server/installers/p2p-gpu/lib/services.sh
new file mode 100644
index 000000000..ced8fff77
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/lib/services.sh
@@ -0,0 +1,797 @@
#!/usr/bin/env bash
# ============================================================================
# DreamServer — P2P GPU Service Discovery & Management
# ============================================================================
# Part of: dream-server/installers/p2p-gpu/lib/
# Purpose: Manifest-driven service discovery, port enumeration, compose
#          command detection, Docker image pre-pull, service startup
#
# Expects: DREAM_USER, LOGFILE, log(), warn(), err(), env_get(), env_set(),
#          expose_ports_for_vastai()
# Provides: read_manifest_field(), discover_all_services(),
#           discover_service_ports(), extract_compose_uid(),
#           get_compose_cmd(), start_services(), prepull_docker_images()
#
# Modder notes:
#   Requires python3 + PyYAML (installed in Phase
1). Functions gracefully
#   return empty when python3/PyYAML is unavailable.
#   [FIX: python-except] Python catches only specific exceptions with logging.
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# Resolve dream-network gateway for host-agent binding.
# Output:  the first IPAM gateway of the "dream-network" Docker network
# Returns: 0 when the value looks like a dotted-quad IPv4 address, else 1
_resolve_dream_network_gateway() {
    local gateway
    gateway=$(docker network inspect dream-network \
        --format '{{(index .IPAM.Config 0).Gateway}}' 2>>"$LOGFILE" | head -1 || echo "")
    # Drop stray whitespace without spawning a process; the regex below
    # rejects anything that is not a plain IPv4 address anyway.
    gateway="${gateway//[[:space:]]/}"
    if [[ "$gateway" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
        echo "$gateway"
        return 0
    fi
    return 1
}

# Returns 0 when $1 is empty or one of the loopback spellings listed below.
_is_loopback_addr() {
    # ${1:-} keeps a call without arguments from aborting under `set -u`.
    case "${1:-}" in
        ""|"127.0.0.1"|"localhost"|"::1") return 0 ;;
        *) return 1 ;;
    esac
}

# Restart the host agent through dream-cli as DREAM_USER.
# Args:    $1 - DreamServer install directory
# Returns: 0 on success, 1 when dream-cli is missing or the restart fails
_restart_host_agent() {
    local ds_dir="$1"
    local dream_cli="${ds_dir}/dream-cli"
    local quoted_dir

    if [[ ! -x "$dream_cli" ]]; then
        warn "dream-cli not found at ${dream_cli} — skipping host agent restart"
        return 1
    fi

    # The path is spliced into a command string that another shell re-parses,
    # so it must be shell-quoted (spaces or metacharacters would otherwise
    # split or be interpreted).
    printf -v quoted_dir '%q' "$ds_dir"

    # [NON-FATAL: host-agent] Restart can be retried manually if it fails.
    su - "$DREAM_USER" -c "cd ${quoted_dir} && DREAM_HOME=${quoted_dir} ./dream-cli agent restart" \
        >> "$LOGFILE" 2>&1 || { warn "Host agent restart failed (non-fatal)"; return 1; }
    return 0
}

# Ensure host agent binds to the Dream network gateway so containers can reach it.
_ensure_host_agent_network_binding() {
    local ds_dir="$1"
    local env_file="${ds_dir}/.env"
    [[ !
-f "$env_file" ]] && return 0 + + local gateway + gateway=$(_resolve_dream_network_gateway) || return 0 + + local bind host updated=false + bind="$(env_get "$env_file" "DREAM_AGENT_BIND")" + host="$(env_get "$env_file" "DREAM_AGENT_HOST")" + + if _is_loopback_addr "$bind"; then + env_set "$env_file" "DREAM_AGENT_BIND" "$gateway" + updated=true + fi + if _is_loopback_addr "$host"; then + env_set "$env_file" "DREAM_AGENT_HOST" "$gateway" + updated=true + fi + + if [[ "$updated" == "true" ]]; then + log "Pinned host agent binding to dream-network gateway ${gateway}" + _restart_host_agent "$ds_dir" || warn "Host agent restart after bind update failed (non-fatal)" + fi +} + +# Generate a p2p-gpu compose overlay that fixes multi-GPU device reservation +# and passes GPU assignment env vars that the upstream multigpu overlay omits. +# Only generated when GPU_COUNT >= 2 and GPU_BACKEND is nvidia. +# The overlay is appended to compose_flags so it merges on top of whatever +# the upstream resolver selected. +_generate_p2p_gpu_overlay() { + local ds_dir="$1" + local overlay="${ds_dir}/docker-compose.p2p-gpu.yml" + local backend="${GPU_BACKEND:-}" + + if [[ "${GPU_COUNT:-0}" -lt 2 ]]; then + if [[ -f "$overlay" ]]; then + # [NON-FATAL: overlay] Cleanup failure should not block install flow. + rm -f "$overlay" || warn "Failed to remove p2p-gpu overlay (non-fatal)" + fi + return 0 + fi + + if [[ -z "$backend" ]]; then + backend=$(detect_gpu_backend) + fi + + if [[ "$backend" == "nvidia" ]]; then + cat > "$overlay" << 'P2P_OVERLAY_EOF' +# Auto-generated by p2p-gpu toolkit - do not edit manually. +# Ensures multi-GPU device reservation and GPU assignment env vars. 
+services: + llama-server: + environment: + LLAMA_ARG_MAIN_GPU: "${LLAMA_ARG_MAIN_GPU:-}" + GGML_CUDA_P2P: "${GGML_CUDA_P2P:-}" + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] +P2P_OVERLAY_EOF + elif [[ "$backend" == "amd" ]]; then + cat > "$overlay" << 'P2P_OVERLAY_EOF' +# Auto-generated by p2p-gpu toolkit - do not edit manually. +services: + llama-server: + environment: + LLAMA_ARG_TENSOR_SPLIT: "${LLAMA_ARG_TENSOR_SPLIT:-}" +P2P_OVERLAY_EOF + else + if [[ -f "$overlay" ]]; then + # [NON-FATAL: overlay] Cleanup failure should not block install flow. + rm -f "$overlay" || warn "Failed to remove p2p-gpu overlay (non-fatal)" + fi + return 0 + fi + + if [[ -f "$overlay" ]]; then + # [NON-FATAL: overlay] Ownership fix failure should not block compose usage. + chown "${DREAM_USER}:${DREAM_USER}" "$overlay" || warn "Overlay ownership fix failed (non-fatal)" + # [NON-FATAL: overlay] Mode fix failure should not block compose usage. + chmod 0644 "$overlay" || warn "Overlay chmod failed (non-fatal)" + log "Generated p2p-gpu compose overlay: ${overlay}" + fi +} + +# Ensure Dream host agent is running so Dashboard model downloads can start. +_ensure_host_agent_running() { + local ds_dir="$1" + local dream_cli="${ds_dir}/dream-cli" + local agent_port agent_bind + local agent_probe + + if [[ ! 
-x "$dream_cli" ]]; then + warn "dream-cli not found at ${dream_cli} — skipping host agent auto-start" + return 0 + fi + + agent_port="$(grep '^DREAM_AGENT_PORT=' "${ds_dir}/.env" 2>/dev/null | cut -d= -f2 | tr -d '[:space:]' || echo "7710")" # stderr expected: .env may not exist + agent_port="${agent_port:-7710}" + agent_bind="$(grep '^DREAM_AGENT_BIND=' "${ds_dir}/.env" 2>/dev/null | cut -d= -f2 | tr -d '[:space:]' || echo "127.0.0.1")" # stderr expected: .env may not exist + agent_bind="${agent_bind:-127.0.0.1}" + agent_probe="$agent_bind" + if [[ "$agent_probe" == "0.0.0.0" || "$agent_probe" == "::" ]]; then + agent_probe="127.0.0.1" + fi + + if curl -sf --max-time 2 "http://${agent_probe}:${agent_port}/health" >/dev/null 2>&1; then + log "Host agent already running on port ${agent_port}" + return 0 + fi + + if ! command -v python3 &>/dev/null; then + warn "python3 not found — host agent cannot start" + warn "Install: apt-get install -y python3" + return 1 + fi + + local agent_script="${ds_dir}/bin/dream-host-agent.py" + if [[ ! -f "$agent_script" ]]; then + warn "Agent script not found at ${agent_script} — skipping" + return 1 + fi + + local attempt pid_file="${ds_dir}/data/dream-host-agent.pid" wait_elapsed + for attempt in 1 2; do + log "Starting host agent (attempt ${attempt}/2)..." + # [NON-FATAL: host-agent] Start failure can be retried or handled manually. + su - "$DREAM_USER" -c "cd ${ds_dir} && DREAM_HOME=${ds_dir} ./dream-cli agent start" \ + >> "$LOGFILE" 2>&1 || warn "dream-cli agent start returned non-zero (attempt ${attempt})" + + wait_elapsed=0 + while [[ $wait_elapsed -lt 20 ]]; do + sleep 3 + wait_elapsed=$((wait_elapsed + 3)) + if curl -sf --max-time 2 "http://${agent_probe}:${agent_port}/health" >/dev/null 2>&1; then + log "Host agent verified running on port ${agent_port} (attempt ${attempt})" + return 0 + fi + done + + if [[ $attempt -eq 1 ]]; then + warn "Host agent not responding after start — retrying..." 
+ if [[ -f "$pid_file" ]]; then + # [NON-FATAL: cleanup] Stale pid cleanup should not block host agent retry. + kill "$(cat "$pid_file")" 2>>"$LOGFILE" || warn "stale host agent pid in ${pid_file} could not be killed" + rm -f "$pid_file" + fi + fi + done + + warn "Host agent failed to start after 2 attempts" + warn "Manual start: su - ${DREAM_USER} -c 'cd ${ds_dir} && DREAM_HOME=${ds_dir} ./dream-cli agent start'" + warn "Check logs: cat ${ds_dir}/data/dream-host-agent.log" + return 1 +} + +# Ensure OpenCode web is reachable on no-systemd hosts (Vast.ai fallback). +_ensure_opencode_web_running() { + local ds_dir="$1" + local env_file="${ds_dir}/.env" + local opencode_bin="/home/${DREAM_USER}/.opencode/bin/opencode" + local opencode_port opencode_password escaped_password launch_dir escaped_launch_dir + + opencode_port=$(env_get "$env_file" "OPENCODE_PORT") + opencode_port="${opencode_port:-3003}" + + if curl -sf --max-time 3 "http://127.0.0.1:${opencode_port}/" >/dev/null 2>&1; then + log "OpenCode web already reachable on port ${opencode_port}" + return 0 + fi + + if [[ ! -x "$opencode_bin" ]]; then + warn "OpenCode binary not found at ${opencode_bin} — skipping OpenCode web auto-start" + return 0 + fi + + opencode_password=$(env_get "$env_file" "OPENCODE_SERVER_PASSWORD") + if [[ -z "$opencode_password" ]]; then + opencode_password=$(openssl rand -base64 16) + env_set "$env_file" "OPENCODE_SERVER_PASSWORD" "$opencode_password" + log "Generated OPENCODE_SERVER_PASSWORD for secure OpenCode web access" + fi + + launch_dir="$ds_dir" + if ! 
su - "$DREAM_USER" -c "test -r $(printf '%q' "$ds_dir") && test -x $(printf '%q' "$ds_dir")"; then
        launch_dir="$DREAM_HOME"
        warn "OpenCode launch dir ${ds_dir} is not accessible to ${DREAM_USER}; using ${launch_dir}"
    fi

    mkdir -p "${ds_dir}/logs"
    escaped_password=$(printf '%q' "$opencode_password")
    escaped_launch_dir=$(printf '%q' "$launch_dir")
    if su - "$DREAM_USER" -c \
        "cd ${escaped_launch_dir} && OPENCODE_SERVER_PASSWORD=${escaped_password} nohup ${opencode_bin} web --hostname 0.0.0.0 --port ${opencode_port} >> ${ds_dir}/logs/opencode-web.log 2>&1 &" \
        >> "$LOGFILE" 2>&1; then
        sleep 2
        if curl -sf --max-time 4 "http://127.0.0.1:${opencode_port}/" >/dev/null 2>&1; then
            log "Started OpenCode web fallback on port ${opencode_port}"
        else
            warn "OpenCode fallback launch command succeeded but service is not reachable yet"
        fi
    else
        warn "OpenCode fallback launch failed (non-fatal)"
    fi
}

# Strip trailing "  # comment" text from NAME_PORT=<digits> lines in an env file.
# Args:    $1 - path to the env file (a missing file is a no-op)
# Output:  one rewritten variable name per line on stdout; nothing when the
#          file was left untouched
# Returns: always 0 — a missing or failing python3 only produces a warning on
#          stderr, so callers that capture stdout under `set -e` are not aborted
_normalize_dashboard_api_port_envs() {
    local env_file="$1"

    [[ -f "$env_file" ]] || return 0

    if ! command -v python3 &>/dev/null; then
        warn "python3 not found — skipping port env normalization (non-fatal)" >&2
        return 0
    fi

    # [NON-FATAL: env] Normalization is cosmetic cleanup; stdout is data, so
    # the warning is forced to stderr.
    python3 - "$env_file" <<'PY' || warn "port env normalization failed for ${env_file} (non-fatal)" >&2
from pathlib import Path
import re
import sys

path = Path(sys.argv[1])
text = path.read_text()
pattern = re.compile(r'^([A-Z0-9_]+_PORT)=(\d+)\s+#.*$')
changed = []
lines = []

for line in text.splitlines():
    match = pattern.match(line)
    if match:
        line = f"{match.group(1)}={match.group(2)}"
        changed.append(match.group(1))
    lines.append(line)

new_text = "\n".join(lines) + ("\n" if text.endswith("\n") else "")
if new_text != text:
    path.write_text(new_text)
    if changed:
        print("\n".join(changed))
PY
}

# Read a field from a manifest.yaml service: block
# Args:    $1 - manifest path, $2 - field name inside the "service" mapping
# Output:  the field value; list values are joined with single spaces; a
#          missing or null field prints an empty line
# Notes:   An empty manifest, or one whose top level / "service" entry is not
#          a mapping, is treated as "no fields" instead of raising.
read_manifest_field() {
    local manifest="$1" field="$2"
    # [NON-FATAL: discovery] A single bad manifest should not block others.
    python3 -c "
import yaml, sys
try:
    with open(sys.argv[1]) as fh:
        data = yaml.safe_load(fh)
    if not isinstance(data, dict):
        data = {}
    svc = data.get('service')
    if not isinstance(svc, dict):
        svc = {}
    val = svc.get(sys.argv[2])
    if val is None:
        val = ''
    if isinstance(val, list):
        print(' '.join(str(v) for v in val))
    else:
        print(val)
except yaml.YAMLError as e:
    print(f'YAML parse error in {sys.argv[1]}: {e}', file=sys.stderr)
except OSError as e:
    print(f'File read error {sys.argv[1]}: {e}', file=sys.stderr)
" "$manifest" "$field" || warn "manifest field read failed for ${manifest}:${field} (non-fatal)"
}

# Discover all enabled services from extension manifests.
# Usage: discover_all_services [hints_file]
# Output: ID|PORT_ENV|PORT_DEFAULT|NAME|CATEGORY|PROXY_MODE|STARTUP_BEHAVIOR|CONTAINER_NAME
discover_all_services() {
    local ds_dir="$1"
    local hints_file="${2:-}"
    if [[ -z "$hints_file" && -n "${SCRIPT_DIR:-}" ]]; then
        hints_file="${SCRIPT_DIR}/config/service-hints.yaml"
    fi
    local ext_dirs=("${ds_dir}/extensions/services" "${ds_dir}/user-extensions")

    for ext_root in "${ext_dirs[@]}"; do
        [[ ! -d "$ext_root" ]] && continue
        for manifest in "${ext_root}"/*/manifest.yaml; do
            [[ ! -f "$manifest" ]] && continue
            # [NON-FATAL: discovery] A single bad manifest should not block others.
+ python3 -c "import os, yaml, sys; data = yaml.safe_load(open(sys.argv[1])) or {}; svc = data.get('service') or {}; sid = svc.get('id', ''); port_env = svc.get('external_port_env', ''); port_def = svc.get('external_port_default', ''); name = svc.get('name', sid); cat = svc.get('category', 'optional'); hints = {}; hints_path = sys.argv[2] if len(sys.argv) > 2 else ''; hints = ((yaml.safe_load(open(hints_path)) or {}).get(sid, {}) if (hints_path and os.path.exists(hints_path) and sid) else {}); proxy = hints.get('proxy_mode', svc.get('proxy_mode', 'simple')); startup = hints.get('startup_behavior', svc.get('startup_behavior', 'normal')); cname = svc.get('container_name', ''); htimeout = svc.get('health_timeout', 0); startup = 'heavy' if startup == 'normal' and isinstance(htimeout, (int, float)) and htimeout > 20 else startup; print(f'{sid}|{port_env}|{port_def}|{name}|{cat}|{proxy}|{startup}|{cname}') if sid else None" "$manifest" "$hints_file" || warn "service discovery failed for ${manifest} (non-fatal)" + done + done +} + +# Discover service ports from .env / manifests. +# Output: SERVICE_KEY|PORT_NUMBER|LABEL +# Reads explicit _PORT= lines from .env, then fills in manifest defaults +# for any services whose port_env isn't already set. +discover_service_ports() { + local ds_dir="$1" + local env_file="${ds_dir}/.env" + local env_example="${ds_dir}/.env.example" + + declare -A PORT_LABELS PORT_DEFAULTS SEEN_KEYS + while IFS='|' read -r _id port_env port_def svc_name _rest; do + [[ -z "$port_env" ]] && continue + PORT_LABELS["$port_env"]="$svc_name" + [[ -n "$port_def" ]] && PORT_DEFAULTS["$port_env"]="$port_def" + done < <(discover_all_services "$ds_dir") + + local source_file="$env_file" + [[ ! -f "$source_file" ]] && source_file="$env_example" + [[ ! 
-f "$source_file" ]] && return 0 + + # Emit ports explicitly set in .env + awk -F= '/^[A-Z_]+_PORT=/{print}' "$source_file" | while IFS='=' read -r key value; do + value=$(echo "$value" | sed 's/[[:space:]]#.*$//' | tr -d '"' | tr -d "'" | xargs) + [[ -z "$value" ]] && continue + local label="${PORT_LABELS[$key]:-$key}" + echo "${key}|${value}|${label}" + done + + # Track which keys were already emitted + while IFS= read -r key; do + SEEN_KEYS["$key"]=1 + done < <(awk -F= '/^[A-Z_]+_PORT=/{print $1}' "$source_file") + + # Fill in manifest defaults for services not in .env + for key in "${!PORT_DEFAULTS[@]}"; do + [[ -n "${SEEN_KEYS[$key]:-}" ]] && continue + local label="${PORT_LABELS[$key]:-$key}" + echo "${key}|${PORT_DEFAULTS[$key]}|${label}" + done +} + +# Detect available compose command +get_compose_cmd() { + if docker compose version &>/dev/null; then + echo "docker compose" + elif command -v docker-compose &>/dev/null; then + echo "docker-compose" + else + err "Neither 'docker compose' nor 'docker-compose' found" + exit 1 + fi +} + +# Pre-pull Docker images in parallel +prepull_docker_images() { + local ds_dir="$1" + local max_parallel="${2:-4}" + + local images + images=$(grep -rh 'image:' "${ds_dir}"/docker-compose*.yml \ + "${ds_dir}"/extensions/services/*/compose*.y*ml 2>&1 \ + | sed -E 's/.*image:\s*//' | tr -d '"' | tr -d "'" \ + | sort -u | grep -v '^\$' || echo "") + + if [[ -z "$images" ]]; then + log "No Docker images found to pre-pull" + return 0 + fi + + local count + count=$(echo "$images" | wc -l) + log "Pre-pulling ${count} Docker images (${max_parallel} parallel)..." + + # [NON-FATAL: images] Images can be pulled later during compose up. 
+ echo "$images" | xargs -P "$max_parallel" -I {} sh -c \ + 'docker pull {} >/dev/null 2>&1 && echo " pulled: {}" || echo " skip: {} (will retry at compose up)"' \ + || warn "some image pulls failed (non-fatal)" + + log "Docker image pre-pull complete" +} + +# ── Remove stale Docker network ──────────────────────────────────────────── +_cleanup_stale_network() { + if ! docker network inspect dream-network >/dev/null 2>&1; then + return 0 + fi + local net_label + net_label=$(docker network inspect dream-network \ + --format '{{index .Labels "com.docker.compose.network"}}' 2>&1 || echo "") + if [[ -n "$net_label" ]]; then + return 0 + fi + log "Removing stale dream-network (missing compose labels)..." + for cid in $(docker network inspect dream-network \ + -f '{{range .Containers}}{{.Name}} {{end}}' 2>&1 || echo ""); do + # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none. + docker network disconnect -f dream-network "$cid" || warn "disconnect ${cid} failed (non-fatal)" + done + # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none. + docker network rm dream-network || warn "network rm failed (non-fatal)" +} + +_set_safe_llama_cpu_caps() { + local env_file="$1" max_cpu="$2" + [[ ! -f "$env_file" ]] && return 0 + + local llama_limit="${max_cpu}.0" + local llama_reservation="2.0" + if [[ "$max_cpu" -lt 2 ]]; then + llama_reservation="1.0" + fi + + env_set "$env_file" "LLAMA_CPU_LIMIT" "$llama_limit" + env_set "$env_file" "LLAMA_CPU_RESERVATION" "$llama_reservation" +} + +_extract_cpu_ceiling_from_compose_error() { + local compose_err="$1" + local ceiling="" + + ceiling=$(tr -d '\r' < "$compose_err" | grep -Eo 'range of CPUs is from [0-9.]+ to [0-9.]+' 2>>"$LOGFILE" \ + | head -1 | awk '{print $NF}' | cut -d'.' 
-f1 || echo "")

    if [[ -z "$ceiling" ]]; then
        ceiling=$(tr -d '\r' < "$compose_err" | grep -Eo 'only [0-9]+ CPUs available' 2>>"$LOGFILE" \
            | head -1 | awk '{print $2}' || echo "")
    fi

    if [[ "$ceiling" =~ ^[0-9]+$ ]] && [[ "$ceiling" -gt 0 ]]; then
        echo "$ceiling"
    fi
}

# Returns 0 when the captured compose output mentions a CPU-limit error.
# Args: $1 - file holding the captured compose output
_compose_output_has_cpu_error() {
    local compose_err="$1"
    # grep reads the file directly: with `pipefail`, `tr | grep -q` can report
    # failure on a match because grep -q exits early and the writer is killed
    # by SIGPIPE. None of the alternatives is anchored, so stray carriage
    # returns cannot affect the match.
    grep -Eqi -e "range of CPUs is from|only [0-9]+ CPUs available|invalid.*cpu|NanoCPUs" -- "$compose_err"
}

# Print the compose files named by -f/--file options, one per line.
# Args:   $1 - DreamServer install directory (prefix for relative paths)
#         $2 - compose flag string, e.g. "-f a.yml -f /abs/b.yml"
# Output: absolute paths unchanged, relative ones prefixed with "$1/"
_resolve_compose_files_from_flags() {
    local ds_dir="$1" compose_flags="$2"
    local prev="" token

    # Word splitting is intended here: the flags travel as one string.
    # shellcheck disable=SC2086
    for token in $compose_flags; do
        if [[ "$prev" == "-f" ]]; then
            if [[ "$token" == /* ]]; then
                echo "$token"
            else
                echo "${ds_dir}/${token}"
            fi
            prev=""
            continue
        fi
        # An if-statement (not `[[ … ]] && …`) so the loop, and with it the
        # function, does not return 1 merely because the last token was not
        # a file option.
        if [[ "$token" == "-f" || "$token" == "--file" ]]; then
            prev="-f"
        fi
    done
}

# Print the option that disables ANSI output for the given compose command.
_compose_ansi_flag() {
    local compose_cmd="$1"
    case "$compose_cmd" in
        "docker compose") echo "--ansi never" ;;
        "docker-compose") echo "--no-ansi" ;;
        *) echo "" ;;
    esac
}

# List the service names of the resolved compose stack (runs as DREAM_USER).
# Args: $1 - DreamServer dir, $2 - compose command, $3 - compose flag string
_compose_list_services() {
    local ds_dir="$1" compose_cmd="$2" compose_flags="$3"
    local ansi_flag cmd quoted_dir
    ansi_flag=$(_compose_ansi_flag "$compose_cmd")
    cmd="${compose_cmd}"
    [[ -n "$ansi_flag" ]] && cmd="${cmd} ${ansi_flag}"
    cmd="${cmd} ${compose_flags} config --services"

    # The directory is re-parsed by the shell started by `su -c`; quote it.
    printf -v quoted_dir '%q' "$ds_dir"
    su - "$DREAM_USER" -c "cd ${quoted_dir} && ${cmd}" 2>>"$LOGFILE"
}

# Extract the names of services whose image could not be pulled.
# Args:    $1 - file holding the captured compose output
# Output:  unique service names, one per line (possibly nothing)
# Returns: 0 unless sort itself fails
_extract_missing_image_services() {
    local compose_err="$1"
    local matched_lines status=0
    matched_lines=$(tr -d '\r' < "$compose_err" | grep -Ei 'Error manifest for|pull access denied for') || status=$?
    # grep exit: 0 = matched, 1 = no match (expected), >1 = real error
    if (( status > 1 )); then
        warn "grep failed scanning compose stderr for missing-image errors (status ${status})"
    fi
    [[ -z "$matched_lines" ]] && return 0

    local service line cleaned
    while IFS= read -r line; do
        service=""
        # Quotes around names would defeat the patterns below.
        cleaned="${line//\'/}"
        cleaned="${cleaned//\"/}"
        if [[ "$cleaned" =~ ^[[:space:]]*([a-zA-Z0-9._-]+)[[:space:]]+(Error[[:space:]]+manifest[[:space:]]+for|pull[[:space:]]+access[[:space:]]+denied[[:space:]]+for) ]]; then
            service="${BASH_REMATCH[1]}"
        elif [[ "$cleaned" =~ [Ss]ervice[[:space:]]*([a-zA-Z0-9._-]+) ]]; then
            service="${BASH_REMATCH[1]}"
        elif [[ "$cleaned" =~ ^([a-zA-Z0-9._-]+)[[:space:]]*[\|:] ]]; then
            service="${BASH_REMATCH[1]}"
        fi
        # if-statement instead of `[[ … ]] && echo`: with `pipefail` a final
        # line without a recognisable service name made the whole function
        # return 1.
        if [[ -n "$service" ]]; then
            echo "$service"
        fi
    done <<< "$matched_lines" | sort -u
}

# Run "<compose> up -d [up_flags] [services…]" as DREAM_USER.
# Args:    $1 - DreamServer dir, $2 - compose command, $3 - compose flag string,
#          $4 - file receiving a copy of the combined output,
#          $5 - extra flags for "up" (may be empty), $6… - optional service names
# Outputs: combined stdout/stderr is appended to LOGFILE, written to $4 and
#          passed through on stdout
_compose_up_with_flags() {
    local ds_dir="$1" compose_cmd="$2" compose_flags="$3" compose_err="$4" up_flags="$5"
    shift 5
    local ansi_flag cmd service_args quoted_dir
    ansi_flag=$(_compose_ansi_flag "$compose_cmd")
    cmd="${compose_cmd}"
    [[ -n "$ansi_flag" ]] && cmd="${cmd} ${ansi_flag}"
    cmd="${cmd} ${compose_flags} up -d"
    [[ -n "$up_flags" ]] && cmd="${cmd} ${up_flags}"
    if [[ "$#" -gt 0 ]]; then
        printf -v service_args ' %q' "$@"
        cmd="${cmd}${service_args}"
    fi

    # Same quoting rule as the service names above.
    printf -v quoted_dir '%q' "$ds_dir"
    su - "$DREAM_USER" -c "cd ${quoted_dir} && ${cmd}" 2>&1 \
        | tee -a "$LOGFILE" | tee "$compose_err"
}

_apply_host_cpu_caps() {
    local ds_dir="$1" env_file="$2" daemon_ceiling="${3:-}" compose_flags="${4:-}"
    local nproc_count docker_ncpu compose_ceiling max_cpu
    local -a compose_files=()

    nproc_count=$(nproc 2>>"$LOGFILE" || echo 1)
    docker_ncpu=$(docker info --format '{{.NCPU}}' 2>>"$LOGFILE" || echo "unknown")
    compose_ceiling=$(get_compose_cpu_ceiling)
    max_cpu=$(compute_safe_cpu_cap "$daemon_ceiling")

    cap_cpu_in_yaml "$ds_dir" "$max_cpu"
    if [[ -n "$compose_flags" ]]; then
        mapfile -t compose_files < <(_resolve_compose_files_from_flags "$ds_dir"
"$compose_flags")
        if [[ "${#compose_files[@]}" -gt 0 ]]; then
            cap_cpu_in_files "$max_cpu" "${compose_files[@]}"
        fi
    fi
    _set_safe_llama_cpu_caps "$env_file" "$max_cpu"
    log "Ensured compose CPU limits <= ${max_cpu} cores (nproc=${nproc_count}, docker=${docker_ncpu}, ceiling=${compose_ceiling}${daemon_ceiling:+, daemon=${daemon_ceiling}})"
}

# Run "<compose> up -d [services…]" as DREAM_USER.
# Args: $1 - DreamServer dir, $2 - compose command, $3 - compose flag string,
#       $4 - file receiving a copy of the combined output, $5… - service names
# Thin wrapper: _compose_up_with_flags builds the identical command line when
# its extra-flags argument is empty, so the construction lives in one place.
_compose_up() {
    local ds_dir="$1" compose_cmd="$2" compose_flags="$3" compose_err="$4"
    shift 4
    _compose_up_with_flags "$ds_dir" "$compose_cmd" "$compose_flags" "$compose_err" "" "$@"
}

# Recovery ladder behind _compose_up_with_cpu_heal.
# Args:    same as _compose_up_with_cpu_heal, plus $6 = capture file for the
#          compose output; service names follow from $7
# Returns: 0 as soon as one attempt succeeds, otherwise 1
_compose_up_heal_attempts() {
    local ds_dir="$1" compose_cmd="$2" compose_flags="$3" env_file="$4" scope="$5" compose_err="$6"
    shift 6
    local daemon_ceiling

    # Attempt 1: plain bring-up.
    if _compose_up "$ds_dir" "$compose_cmd" "$compose_flags" "$compose_err" "$@"; then
        return 0
    fi

    # Attempt 2: the output mentions a CPU-limit problem — recap and retry.
    if _compose_output_has_cpu_error "$compose_err"; then
        daemon_ceiling=$(_extract_cpu_ceiling_from_compose_error "$compose_err")
        if [[ -n "$daemon_ceiling" ]]; then
            warn "CPU limit exceeds daemon ceiling (${daemon_ceiling}) during ${scope} — recapping and retrying"
        else
            warn "CPU limit exceeds host/daemon cores during ${scope} — recapping and retrying"
        fi
        _apply_host_cpu_caps "$ds_dir" "$env_file" "$daemon_ceiling" "$compose_flags"
        if _compose_up "$ds_dir" "$compose_cmd" "$compose_flags" "$compose_err" "$@"; then
            return 0
        fi
    fi

    # Attempt 3: start everything except the services whose image is missing.
    local missing_services missing_list service_output service
    # Only the printed names matter; a non-zero status from the scan must not
    # abort the ladder.
    missing_services=$(_extract_missing_image_services "$compose_err") || true
    [[ -n "$missing_services" ]] || return 1

    missing_list="${missing_services//$'\n'/, }"
    warn "Compose failed due to missing images for services: ${missing_list}"
    if ! service_output=$(_compose_list_services "$ds_dir" "$compose_cmd" "$compose_flags"); then
        warn "Failed to list compose services after missing-image error (non-fatal)"
        return 1
    fi

    local -A missing_map=()
    local -a filtered_services=()
    while IFS= read -r service; do
        if [[ -n "$service" ]]; then
            missing_map["$service"]=1
        fi
    done <<< "$missing_services"
    while IFS= read -r service; do
        [[ -z "$service" ]] && continue
        [[ -n "${missing_map[$service]:-}" ]] && continue
        filtered_services+=("$service")
    done <<< "$service_output"

    if [[ "${#filtered_services[@]}" -gt 0 ]]; then
        if _compose_up_with_flags "$ds_dir" "$compose_cmd" "$compose_flags" "$compose_err" "--no-deps" "${filtered_services[@]}"; then
            warn "PARTIAL BRING-UP: started ${#filtered_services[@]} services, skipped (missing images): ${missing_list}"
            return 0
        fi
    fi
    return 1
}

# Bring the stack up, healing CPU-limit and missing-image failures.
# Args:    $1 - DreamServer dir, $2 - compose command, $3 - compose flag string,
#          $4 - env file, $5 - label used in log messages, $6… - optional
#          service names (none = whole stack)
# Returns: 0 when any attempt succeeded, 1 otherwise
# The capture file is created here and removed at a single point, whatever
# the outcome of the attempts.
_compose_up_with_cpu_heal() {
    local ds_dir="$1" compose_cmd="$2" compose_flags="$3" env_file="$4" scope="$5"
    shift 5
    local compose_err status=0

    compose_err=$(mktemp) || { warn "mktemp failed — cannot capture compose output for ${scope}"; return 1; }

    _compose_up_heal_attempts "$ds_dir" "$compose_cmd" "$compose_flags" "$env_file" \
        "$scope" "$compose_err" "$@" || status=$?

    rm -f "$compose_err"
    [[ "$status" -eq 0 ]] && return 0
    return 1
}

_heal_dashboard_api_proxy() {
    local env_file="$1"
    local dashboard_port dashboard_api_port dash_status api_status
    dashboard_port=$(env_get "$env_file" "DASHBOARD_PORT")
    dashboard_port="${dashboard_port:-3001}"
    dashboard_api_port=$(env_get "$env_file" "DASHBOARD_API_PORT")
    dashboard_api_port="${dashboard_api_port:-3002}"

    dash_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing")  # stderr expected: container may not exist
    api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing")  # stderr expected: container may not exist
    [[ "$dash_status" != "running" || "$api_status" != "running" ]] && return 0

    if curl -sf --max-time 3 "http://127.0.0.1:${dashboard_api_port}/health" >/dev/null 2>&1 \
        && !
curl -sf --max-time 4 "http://127.0.0.1:${dashboard_port}/api/status" >/dev/null 2>&1; then
        warn "Dashboard returned API 502 while dashboard-api is healthy — restarting dashboard to refresh upstream"
        # [NON-FATAL: dashboard] Individual service failure does not block others.
        docker restart dream-dashboard 2>>"$LOGFILE" || warn "dashboard restart failed (non-fatal)"
    fi
}

# Start DreamServer services via compose
# Args:    $1 - DreamServer install directory
#          $2 - GPU backend name, or "auto" (default) to call detect_gpu_backend
# Globals: GPU_BACKEND (set when empty), GPU_COUNT, DREAM_USER, LOGFILE (read)
# Flow:    fix .env ownership/mode -> choose compose files -> cap CPU limits ->
#          compose up (full stack, then core services, then control plane) ->
#          post-start healing (port env cleanup, Created containers, dashboard
#          proxy, host agent, OpenCode web).
# Exits the script when ds_dir cannot be entered or .env cannot be made
# readable for DREAM_USER.
start_services() {
    local ds_dir="$1"
    local gpu_backend="${2:-auto}"
    local env_file="${ds_dir}/.env"
    local compose_cmd quoted_dir
    compose_cmd=$(get_compose_cmd)

    cd "$ds_dir" || exit 1
    if [[ "$gpu_backend" == "auto" ]]; then
        gpu_backend=$(detect_gpu_backend)
    fi
    if [[ -z "${GPU_BACKEND:-}" ]]; then
        GPU_BACKEND="$gpu_backend"
    fi

    # Last-resort .env permission guard (fatal if fails — compose cannot start without readable .env)
    if [[ -f "$env_file" ]]; then
        # Check and fix ownership independently
        if [[ "$(stat -c '%U' "$env_file" 2>>"$LOGFILE" || echo root)" != "${DREAM_USER}" ]]; then
            chown "${DREAM_USER}:${DREAM_USER}" "$env_file" || {
                err ".env ownership fix failed in start_services — Docker Compose cannot start"
                exit 1
            }
        fi
        # Check and fix mode independently
        if [[ "$(stat -c '%a' "$env_file" 2>>"$LOGFILE")" != "660" ]]; then
            chmod 0660 "$env_file" || {
                err ".env chmod to 0660 failed in start_services — Docker Compose cannot start"
                exit 1
            }
        fi
    fi

    # Paths below are relative to ds_dir (the cd above).
    local gpu_overlay="docker-compose.${gpu_backend}.yml"
    if [[ ! -f "$gpu_overlay" && "$gpu_backend" != "cpu" ]]; then
        warn "GPU overlay ${gpu_overlay} not found — falling back to nvidia"
        gpu_overlay="docker-compose.nvidia.yml"
    fi

    local compose_flags="-f docker-compose.base.yml"
    if [[ "$gpu_backend" != "cpu" && -f "$gpu_overlay" ]]; then
        compose_flags="${compose_flags} -f ${gpu_overlay}"
    fi

    # Prefer upstream compose stack resolver
    if [[ -x "${ds_dir}/scripts/resolve-compose-stack.sh" ]]; then
        log "Using DreamServer's resolve-compose-stack.sh"
        local resolved_flags quoted_backend quoted_count
        printf -v quoted_dir '%q' "$ds_dir"
        printf -v quoted_backend '%q' "$gpu_backend"
        printf -v quoted_count '%q' "${GPU_COUNT:-1}"
        # Only stdout is the flag string. stderr goes to the log, and a failed
        # run keeps the defaults: with `2>&1 || echo ""` any warning or error
        # text printed by the resolver ended up being used as compose flags.
        if resolved_flags=$(su - "$DREAM_USER" -c \
            "cd ${quoted_dir} && ./scripts/resolve-compose-stack.sh --gpu-backend ${quoted_backend} --gpu-count ${quoted_count}" \
            2>>"$LOGFILE") && [[ -n "$resolved_flags" ]]; then
            compose_flags="$resolved_flags"
        else
            warn "resolve-compose-stack.sh failed or returned nothing — using default compose flags"
        fi
    fi

    _generate_p2p_gpu_overlay "$ds_dir"
    if [[ -f "${ds_dir}/docker-compose.p2p-gpu.yml" ]]; then
        compose_flags="${compose_flags} -f docker-compose.p2p-gpu.yml"
    fi

    _cleanup_stale_network
    _apply_host_cpu_caps "$ds_dir" "$env_file" "" "$compose_flags"
    expose_ports_for_vastai "$ds_dir"

    # Degrade step by step: whole stack -> core services -> control plane only.
    if ! _compose_up_with_cpu_heal "$ds_dir" "$compose_cmd" "$compose_flags" "$env_file" "full compose"; then
        warn "Full compose failed — trying core services only"
        if ! _compose_up_with_cpu_heal "$ds_dir" "$compose_cmd" "$compose_flags" "$env_file" \
            "core services" llama-server dashboard-api open-webui dashboard; then
            warn "Core compose with llama failed — bringing up control plane only"
            # [NON-FATAL: compose] Fallback failure still allows manual recovery.
            _compose_up_with_cpu_heal "$ds_dir" "$compose_cmd" "$compose_flags" "$env_file" \
                "control-plane services" dashboard-api dashboard open-webui \
                || warn "control-plane compose up also failed (non-fatal)"
        fi
    fi

    local normalized_ports
    # [NON-FATAL: env] Cosmetic cleanup; a failure here must not abort service start.
    normalized_ports=$(_normalize_dashboard_api_port_envs "$env_file") || normalized_ports=""
    if [[ -n "$normalized_ports" ]]; then
        log "Normalized commented port env values in .env: ${normalized_ports//$'\n'/, }"
        # [NON-FATAL: dashboard] Individual service failure does not block others.
        docker restart dream-dashboard-api 2>>"$LOGFILE" || warn "dashboard-api restart failed (non-fatal)"
        # [NON-FATAL: dashboard] Individual service failure does not block others.
        docker restart dream-dashboard 2>>"$LOGFILE" || warn "dashboard restart failed (non-fatal)"
    fi

    # If compose exited early, some containers may be left in Created state.
    # Try to start them so users can still reach the control plane.
    local created cname
    created=$(docker ps -a --filter "status=created" --format '{{.Names}}' | grep '^dream-' || echo "")
    if [[ -n "$created" ]]; then
        warn "Some containers are still in Created state — attempting docker start"
        while IFS= read -r cname; do
            [[ -z "$cname" ]] && continue
            # [NON-FATAL: service] Individual service failure does not block others.
            docker start "$cname" >/dev/null 2>&1 || warn "start ${cname} failed (non-fatal)"
        done <<< "$created"
    fi

    # Nudge dashboard if stuck in Created state
    if docker ps -a --format '{{.Names}} {{.Status}}' 2>&1 | grep -q 'dream-dashboard Created'; then
        # [NON-FATAL: dashboard] Individual service failure does not block others.
        # Success is only logged when the start really worked.
        if docker start dream-dashboard; then
            log "Kicked dashboard out of Created state"
        else
            warn "dashboard kick failed (non-fatal)"
        fi
    fi

    _heal_dashboard_api_proxy "$env_file"
    _ensure_host_agent_network_binding "$ds_dir"
    # [NON-FATAL: host-agent] Agent availability only affects background downloads.
    _ensure_host_agent_running "$ds_dir" || warn "Host agent unavailable - model downloads may fail until agent is started manually"
    # [NON-FATAL: opencode] Optional service; failures do not block others.
    _ensure_opencode_web_running "$ds_dir" || warn "OpenCode web unavailable (non-fatal)"
}
diff --git a/dream-server/installers/p2p-gpu/phases/00-preflight.sh b/dream-server/installers/p2p-gpu/phases/00-preflight.sh
new file mode 100644
index 000000000..4df809593
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/00-preflight.sh
@@ -0,0 +1,252 @@
#!/usr/bin/env bash
# ============================================================================
# DreamServer — P2P GPU Phase 00: Preflight Checks
# ============================================================================
# Part of: dream-server/installers/p2p-gpu/phases/
# Purpose: GPU detection (NVIDIA/AMD/CPU), disk/Docker/DNS validation,
#          nvidia-container-toolkit setup
#
# Expects: MIN_DISK_GB, MIN_VRAM_MB, LOGFILE, log(), warn(), err(),
#          find_dream_dir(), get_compose_cmd(), detect_gpu()
# Provides: GPU_BACKEND, GPU_NAME, GPU_VRAM, GPU_COUNT, CPU_COUNT,
#           DISK_AVAIL_GB (all exported for later phases)
#
# Fixes covered: #12 (NVIDIA toolkit), #13 (disk space), #14 (compose v1),
#                #17 (DNS), #27 (AMD GPU), #28 (CPU-only fallback)
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

step "Phase 0/12: Preflight checks"

TLS_OK="true"

# Must be root
if [[ $EUID -ne 0 ]]; then
    err "This script must be run as root.
# Abort the install when the root filesystem is too small for a fresh
# DreamServer, but let an existing installation (one whose directory already
# has a .env) continue with a warning.
# Globals: DISK_AVAIL_GB (read, empty/unset counts as 0), MIN_DISK_GB (read)
# Exits:   status 1 when disk is below the minimum and no install exists
_check_disk_space() {
    local prior_install
    prior_install=$(find_dream_dir 2>&1 || echo "")

    local low_disk=false
    if [[ "${DISK_AVAIL_GB:-0}" -lt "$MIN_DISK_GB" ]]; then
        low_disk=true
    fi
    [[ "$low_disk" == "true" ]] || return 0

    # Fresh host without enough room: nothing sensible can be installed.
    if [[ -z "$prior_install" || ! -f "${prior_install}/.env" ]]; then
        err "Disk space (${DISK_AVAIL_GB} GB) below minimum (${MIN_DISK_GB} GB)."
        err "DreamServer needs 40+ GB. Create a Vast.ai instance with more disk."
        exit 1
    fi

    warn "Disk (${DISK_AVAIL_GB} GB) below ${MIN_DISK_GB} GB, but DreamServer already installed"
}
# Run detect_nvml_mismatch against the probe image and attempt a repair when
# it reports a mismatch (exit status 1). Any other non-zero status is ignored.
# Arguments: $1 - container image passed to detect_nvml_mismatch
# Returns:   always 0; a failed repair only produces a warning
_check_and_repair_nvml_mismatch() {
    local probe_image="$1"
    local mismatch_status=0

    detect_nvml_mismatch "$probe_image" || mismatch_status=$?
    if [[ "$mismatch_status" -eq 1 ]]; then
        warn "NVIDIA driver/library mismatch detected — attempting repair"
        if ! repair_nvml_mismatch; then
            warn "NVIDIA driver mismatch repair did not complete (non-fatal)"
        fi
    fi
    return 0
}

# Verify that Docker can hand the NVIDIA GPU to a container, and try to
# install nvidia-container-toolkit when the probe fails and dpkg does not
# know the package.
# Globals: NVIDIA_DOCKER_TEST_TIMEOUT (read, default 180 s),
#          APT_LOCK_TIMEOUT (read, default 120 s), LOGFILE (appended to)
# Returns: always 0 — every failure in here is reported as a warning so the
#          remaining preflight checks still run.
_verify_nvidia_passthrough() {
    local gpu_test_image="nvidia/cuda:12.4.1-base-ubuntu22.04"
    local passthrough_timeout="${NVIDIA_DOCKER_TEST_TIMEOUT:-180}"
    local probe_rc=0

    log "Verifying NVIDIA Docker passthrough (timeout ${passthrough_timeout}s; first run may pull ${gpu_test_image})"
    timeout --signal=TERM "${passthrough_timeout}" \
        docker run --rm --gpus all "${gpu_test_image}" nvidia-smi &>/dev/null || probe_rc=$?

    if [[ "$probe_rc" -eq 0 ]]; then
        log "NVIDIA Docker passthrough verified"

        # ── [FIX: nvml-mismatch] Detect and repair driver/library mismatch ────────
        log "Checking for NVIDIA driver/library version misalignment..."
        _check_and_repair_nvml_mismatch "${gpu_test_image}"
        return 0
    fi

    # timeout(1) exits with 124 when it had to stop the command.
    if [[ "$probe_rc" -eq 124 ]]; then
        warn "NVIDIA GPU passthrough probe timed out after ${passthrough_timeout}s — checking toolkit..."
    else
        warn "NVIDIA GPU passthrough test failed (exit ${probe_rc}) — checking toolkit..."
    fi

    # Toolkit already known to dpkg: nothing to install here.
    if dpkg -l nvidia-container-toolkit &>/dev/null; then
        return 0
    fi
    warn "nvidia-container-toolkit not installed — attempting install"

    # [NON-FATAL: dpkg] apt will still enforce DPkg::Lock::Timeout.
    _wait_for_dpkg_lock 60 || warn "dpkg lock not released in time — DPkg::Lock::Timeout will handle"

    local keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
    # [NON-FATAL: repo] Transient GPG/keyring failures should not halt install.
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
        | gpg --dearmor --batch --yes --output "$keyring" 2>>"$LOGFILE" \
        || warn "gpg key import failed (non-fatal)"
    # [NON-FATAL: repo] A missing source list surfaces as an apt failure below.
    curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
        | sed "s#deb https://#deb [signed-by=${keyring}] https://#g" \
        | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list > /dev/null \
        || warn "nvidia-container-toolkit repo list setup failed (non-fatal)"

    # Only claim success (and restart Docker) when apt really installed the
    # package; a failed update or install is reported instead of being logged
    # as "installed and configured".
    if ! { apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" update -qq 2>>"$LOGFILE" \
        && apt-get -o DPkg::Lock::Timeout="${APT_LOCK_TIMEOUT:-120}" install -y -qq nvidia-container-toolkit 2>>"$LOGFILE"; }; then
        warn "nvidia-container-toolkit install failed (non-fatal) — GPU containers may not start; see ${LOGFILE}"
        return 0
    fi

    # [NON-FATAL: nvidia-ctk] Toolkit may already be configured or unavailable.
    nvidia-ctk runtime configure --runtime=docker 2>>"$LOGFILE" || warn "nvidia-ctk configure failed (non-fatal)"
    # [NON-FATAL: docker] Docker may not be managed by systemctl on Vast.ai.
    systemctl restart docker 2>>"$LOGFILE" || service docker restart 2>>"$LOGFILE" \
        || warn "docker restart failed (non-fatal)"
    log "nvidia-container-toolkit installed and configured"

    # ── [FIX: nvml-mismatch] Re-check after toolkit install ──────────────
    log "Re-checking for NVIDIA driver/library mismatch after toolkit install..."
    _check_and_repair_nvml_mismatch "${gpu_test_image}"
    return 0
}
# Check that the system CA store can validate TLS for the hosts that model
# downloads and image pulls talk to.
# Globals: TLS_OK (set to "false" when a certificate check fails with curl
#          exit 60), LOGFILE (appended to)
# Returns: 0 in all cases; problems are reported through warn()
_verify_https_trust() {
    local urls=(
        "https://huggingface.co"
        "https://registry-1.docker.io/v2/"
    )
    local failed=false
    local url rc

    if ! command -v curl &>/dev/null; then
        warn "curl not found — skipping HTTPS trust check"
        return 0
    fi

    for url in "${urls[@]}"; do
        # Capture curl's own status with `|| rc=$?`. Reading $? after a
        # finished `if` statement yields 0, which would hide every failure.
        # No -f: any HTTP response (including the registry's 401 for
        # anonymous requests) proves the TLS handshake itself succeeded.
        rc=0
        curl -sI --max-time 10 "$url" > /dev/null 2>>"$LOGFILE" || rc=$?
        if [[ "$rc" -eq 0 ]]; then
            continue
        fi

        # curl exit 60: peer certificate could not be verified against the CA store.
        if [[ "$rc" -eq 60 ]]; then
            warn "HTTPS trust failure when contacting ${url} (curl exit 60)"
            failed=true
        else
            warn "HTTPS check failed for ${url} (curl exit ${rc})"
        fi
    done

    if [[ "$failed" == "true" ]]; then
        TLS_OK="false"
        warn "System TLS trust is broken — model downloads and Docker pulls will fail"
        warn "If behind a proxy, install the proxy root CA, then run:"
        warn "  cp /path/to/proxy-root.crt /usr/local/share/ca-certificates/proxy-root.crt"
        warn "  update-ca-certificates --fresh"
        warn "  systemctl restart docker"
    fi
}
# Vast.ai instances often ship with stale PPAs (e.g. graphics-drivers) that
# time out during apt-get update and cause hard failures under set -e.
# The GPU driver is already installed — these PPAs are not needed.
# The glob is left unquoted on purpose so every source file whose name starts
# with the PPA name is matched, whatever its suffix.
for stale_ppa in graphics-drivers; do
    # ls is used only as an existence test for the glob; its output is discarded.
    if ls /etc/apt/sources.list.d/${stale_ppa}* &>/dev/null; then
        rm -f /etc/apt/sources.list.d/${stale_ppa}*
        log "Removed stale PPA: ${stale_ppa} (not needed — driver already installed)"
    fi
done

# unattended-upgrades can hold the dpkg lock for minutes on fresh Vast.ai
# instances. We rely on DPkg::Lock::Timeout below, but first give the lock up
# to 90 seconds to clear.
# NOTE(review): killing a stuck unattended-upgrades presumably happens inside
# _wait_for_dpkg_lock, which is not visible here — confirm.
# [NON-FATAL: dpkg] apt will still enforce DPkg::Lock::Timeout.
_wait_for_dpkg_lock 90 || warn "dpkg lock not released in time — DPkg::Lock::Timeout will handle"

# Disable unattended-upgrades permanently — it causes NVML mismatches
# and dpkg lock contention on GPU instances.
# Nothing is done when systemctl does not report the unit as enabled.
if systemctl is-enabled unattended-upgrades &>/dev/null; then  # stderr expected: service check
    # [NON-FATAL: systemd] Unattended-upgrades may not be managed on this host.
    systemctl disable unattended-upgrades 2>>"$LOGFILE" || warn "Could not disable unattended-upgrades (non-fatal)"
    # [NON-FATAL: systemd] Unattended-upgrades may not be managed on this host.
    systemctl mask unattended-upgrades 2>>"$LOGFILE" || warn "Could not mask unattended-upgrades (non-fatal)"
    # Logged even when disable/mask only warned above.
    log "Disabled unattended-upgrades (prevents NVIDIA driver/library mismatches)"
fi
set -euo pipefail

step "Phase 2/12: Creating user '${DREAM_USER}'"

# Create the account only when it does not exist yet, so the phase can be
# re-run safely.
if id -u "$DREAM_USER" &>/dev/null; then
    log "User '${DREAM_USER}' already exists"
else
    # Try UID 1000 first; if that fails (e.g. the UID is taken), let useradd
    # pick a UID. The second attempt is fatal under set -e.
    useradd -m -s /bin/bash -u 1000 "$DREAM_USER" 2>&1 || \
        useradd -m -s /bin/bash "$DREAM_USER"
    log "User '${DREAM_USER}' created"
fi

# Sudo access
# [NON-FATAL: permissions] Sudo group add is convenience; install can proceed.
usermod -aG sudo "$DREAM_USER" || warn "sudo group add failed (non-fatal)"
# Passwordless sudo is granted through a drop-in, independent of the group add
# above. The file is rewritten on every run.
echo "${DREAM_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-dream
chmod 440 /etc/sudoers.d/90-dream

# Docker group
# Skipped when the host has no docker group; when the group exists, a failing
# usermod aborts the phase (set -e).
if getent group docker &>/dev/null; then
    usermod -aG docker "$DREAM_USER"
    log "Added ${DREAM_USER} to docker group"
fi

# Copy SSH keys for direct user access
# Runs only when the user has no ~/.ssh yet, so an existing directory is never
# overwritten.
# NOTE(review): this copies everything in /root/.ssh, not only
# authorized_keys — confirm that is intended.
if [[ -d /root/.ssh && ! -d "${DREAM_HOME}/.ssh" ]]; then
    cp -r /root/.ssh "${DREAM_HOME}/.ssh"
    chown -R "${DREAM_USER}:${DREAM_USER}" "${DREAM_HOME}/.ssh"
    # Directory 700 and files 600.
    chmod 700 "${DREAM_HOME}/.ssh"
    find "${DREAM_HOME}/.ssh" -type f -exec chmod 600 {} +
fi

log "User configured"
"Cloned DreamServer (shallow, branch: ${REPO_BRANCH})" + fi +fi + +fix_ownership "$REPO_DIR" "$DREAM_USER" diff --git a/dream-server/installers/p2p-gpu/phases/04-installer.sh b/dream-server/installers/p2p-gpu/phases/04-installer.sh new file mode 100644 index 000000000..203186ceb --- /dev/null +++ b/dream-server/installers/p2p-gpu/phases/04-installer.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# ============================================================================ +# Dream Server — Vast.ai Phase 04: Run Upstream Installer +# ============================================================================ +# Part of: p2p-gpu/phases/ +# Purpose: Execute DreamServer's install.sh with timeout protection +# +# Expects: REPO_DIR, DREAM_USER, INSTALLER_TIMEOUT, GPU_BACKEND, GPU_VRAM, +# GPU_COUNT, log(), warn(), err() +# Provides: DreamServer installed (may be partial if timeout hit) +# +# Fixes covered: #25 (ComfyUI infinite hang), #26 (installer timeout) +# +# Modder notes: +# Timeout is non-fatal. Heavy services (ComfyUI, Whisper) download in +# background and are handled by later phases. We only cap the installer +# wait loop, not the actual containers. +# +# SPDX-License-Identifier: Apache-2.0 +# ============================================================================ + +set -euo pipefail + +step "Phase 4/12: Running DreamServer installer" + +warn "Running installer (${INSTALLER_TIMEOUT}s timeout)..." +warn "Heavy services (ComfyUI, Whisper, etc.) will continue after timeout." + +install_exit=0 +installer_pid="" + +# Map detected VRAM to upstream installer tier system so non-interactive +# installs on GPU hosts don't fall through to CPU-tier model selection. +# Hard-fail philosophy: if GPU_BACKEND is nvidia but VRAM is unknown/zero, +# we let the installer auto-detect rather than passing a wrong tier. 
# Tier flag handed to install.sh; stays empty (installer auto-detects) unless
# an NVIDIA GPU with a known, non-zero VRAM size was detected.
installer_tier_arg=""
if [[ "$GPU_BACKEND" == "nvidia" && "${GPU_VRAM:-0}" -gt 0 ]]; then
    # Thresholds are compared against GPU_VRAM in MiB, highest tier first.
    if [[ "$GPU_VRAM" -ge 40000 ]]; then installer_tier_arg="--tier 4"
    elif [[ "$GPU_VRAM" -ge 20000 ]]; then installer_tier_arg="--tier 3"
    elif [[ "$GPU_VRAM" -ge 12000 ]]; then installer_tier_arg="--tier 2"
    else installer_tier_arg="--tier 1"
    fi
    log "Passing ${installer_tier_arg} to installer (GPU_VRAM=${GPU_VRAM} MiB)"
fi

# CDI containers can expose /dev/nvidia* without DRM vendor sysfs. Provide a
# minimal sysfs override for the installer's detection phase when needed.
drm_sys_override=""
if [[ "$GPU_BACKEND" == "nvidia" && ( -e /dev/nvidiactl || -e /dev/nvidia0 ) ]]; then
    has_drm_vendor=false
    # If the glob matches nothing it stays literal, and the -e test below then
    # fails, leaving has_drm_vendor=false.
    for vendor_path in /sys/class/drm/card*/device/vendor; do
        if [[ -e "$vendor_path" ]]; then
            has_drm_vendor=true
            break
        fi
    done
    if [[ "$has_drm_vendor" == "false" ]]; then
        # Fake a single card whose vendor file holds 0x10de (NVIDIA's PCI
        # vendor ID); the path is passed to the installer as DREAM_DRM_SYS.
        # NOTE(review): fixed path under a shared temp dir — confirm this is
        # acceptable on multi-user hosts.
        drm_sys_override="${TMPDIR:-/tmp}/dream-drm-sys"
        mkdir -p "${drm_sys_override}/card0/device"
        printf '0x10de\n' > "${drm_sys_override}/card0/device/vendor"
        log "Providing DRM sysfs override at ${drm_sys_override} for containerized NVIDIA detection"
    fi
fi

# sudo -E -u preserves GPU_BACKEND/GPU_VRAM/GPU_COUNT for the installer's
# detection phase. The previous `su -` was a login shell and stripped them,
# causing the installer to re-run its own (sysfs-based) detection which
# fails on Vast.ai / RunPod / any CDI-based GPU container.
# The installer runs in the background so the loop below can enforce
# INSTALLER_TIMEOUT. installer_tier_arg is expanded unquoted inside the
# command string so "--tier N" reaches install.sh as two words.
sudo -E -u "$DREAM_USER" \
    env HOME="${DREAM_HOME}" \
    GPU_BACKEND="$GPU_BACKEND" \
    GPU_VRAM="${GPU_VRAM:-0}" \
    GPU_COUNT="${GPU_COUNT:-1}" \
    GPU_NAME="${GPU_NAME:-unknown}" \
    DREAM_DRM_SYS="${drm_sys_override:-}" \
    bash -c "cd ${REPO_DIR} && ./install.sh --non-interactive ${installer_tier_arg}" &
# $! is the PID of the backgrounded sudo command, not of install.sh itself.
installer_pid=$!

# Poll every 5 seconds until the installer exits or the timeout is reached.
waited=0
while kill -0 "$installer_pid" 2>/dev/null; do  # stderr expected: process may exit between checks
    if [[ $waited -ge $INSTALLER_TIMEOUT ]]; then
        warn "Installer reached ${INSTALLER_TIMEOUT}s limit — proceeding with setup"
        # [NON-FATAL: cleanup] Installer may have exited before TERM.
        kill -TERM "$installer_pid" 2>>"$LOGFILE" || warn "could not TERM installer (non-fatal)"
        # Give it two seconds to exit before escalating to SIGKILL.
        sleep 2
        if kill -0 "$installer_pid" 2>>"$LOGFILE"; then
            # [NON-FATAL: cleanup] Installer may have exited before KILL.
            kill -9 "$installer_pid" 2>>"$LOGFILE" || warn "could not KILL installer (non-fatal)"
        fi
        # Child processes of the installer should die with their parent.
        # No pkill -f needed — TERM/KILL on the parent suffices.
        # NOTE(review): the signalled PID is sudo's; SIGKILL cannot be relayed
        # to its children, so confirm nothing is left running on that path.
        # 124 mirrors the exit status timeout(1) uses for a timed-out command.
        install_exit=124
        break
    fi
    sleep 5
    waited=$((waited + 5))
    # Progress line once per minute.
    (( waited % 60 == 0 )) && log "Installer running... (${waited}s / ${INSTALLER_TIMEOUT}s max)"
done

# Collect the real exit status unless the installer was stopped by the timeout.
if [[ $install_exit -ne 124 ]]; then
    wait "$installer_pid" 2>>"$LOGFILE" || install_exit=$?
fi

# None of the three outcomes stops the phase; later phases apply fixes.
if [[ $install_exit -eq 0 ]]; then
    log "DreamServer installer completed successfully"
elif [[ $install_exit -eq 124 ]]; then
    log "Installer timed out (normal for heavy services) — continuing"
else
    warn "Installer exited with code ${install_exit} — applying fixes and continuing"
fi
"${REPO_DIR}/dream-server" && -d "${DREAM_HOME}/dream-server" ]]; then + alt_dir="${DREAM_HOME}/dream-server" +fi + +if [[ -n "$alt_dir" && -f "${alt_dir}/.env" ]]; then + apply_post_install_fixes "$alt_dir" "$GPU_BACKEND" + log "Also fixed secondary directory: ${alt_dir}" +fi + +# Cap llama-server context based on GPU VRAM budget +_cap_context_for_vram "$DS_DIR" + +# -- Ensure data/persona/SOUL.md exists ------------------------------------ +# Hermes compose bind-mounts this file. If missing, Docker creates it as a +# directory -> container crashes with "not a directory" error. +_ensure_persona_file() { + local ds_dir="$1" + local persona_file="${ds_dir}/data/persona/SOUL.md" + local template="${ds_dir}/extensions/services/hermes/SOUL.md.template" + + if [[ -f "$persona_file" ]]; then + return 0 + fi + + mkdir -p "${ds_dir}/data/persona" + + # If Docker already created it as a directory, remove it + if [[ -d "$persona_file" ]]; then + log "Removing Docker-created directory at ${persona_file}" + # [NON-FATAL: cleanup] Best-effort cleanup; template fallback still works. 
# Make sure config/llama-server/models.ini is a regular file before it is
# bind-mounted. A directory found at that path is removed, and a missing file
# is created empty and handed to DREAM_USER. An existing file is left as is.
# Arguments: $1 - DreamServer directory
# Globals:   DREAM_USER (read)
_ensure_mount_files() {
    local ds_dir="$1"
    local cfg_dir="${ds_dir}/config/llama-server"
    local ini_path="${cfg_dir}/models.ini"

    # models.ini - llama-server bind mount
    if [[ -d "$ini_path" ]]; then
        log "Removing Docker-created directory at ${ini_path}"
        rm -rf "$ini_path"
    fi

    # Regular file already in place: nothing to create.
    if [[ -f "$ini_path" ]]; then
        return 0
    fi

    mkdir -p "$cfg_dir"
    touch "$ini_path"
    chown "${DREAM_USER}:${DREAM_USER}" "$ini_path"
    log "Created empty ${ini_path}"
}
# Turn a GGUF filename into the LLM_MODEL identifier: drop a trailing
# ".gguf"/".GGUF", drop a trailing "-Q<digits>" quantization tag together
# with its "."/"_" separated parts, then lowercase the rest.
# Example: Qwen3-30B-A3B-Q4_K_M.gguf -> qwen3-30b-a3b
# Arguments: $1 - GGUF filename
# Outputs:   identifier on stdout
_derive_llm_model() {
    local stem="${1%.gguf}"
    local quant_re='^(.*)-Q[0-9]+([._][A-Za-z0-9]+)*$'

    # Only one extension is removed: try the upper-case form when the
    # lower-case one did not match.
    if [[ "$stem" == "$1" ]]; then
        stem="${1%.GGUF}"
    fi

    if [[ "$stem" =~ $quant_re ]]; then
        stem="${BASH_REMATCH[1]}"
    fi

    printf '%s\n' "${stem,,}"
}
"${models_dir}/${gguf_file}" ]]; then + file_size=$(stat -c%s "${models_dir}/${gguf_file}" || echo 0) + if [[ $file_size -gt 100000000 ]]; then + model_ready=true + log "Model verified: ${gguf_file} ($(( file_size / 1048576 )) MB)" + if [[ -z "$(env_get "$env_file" "LLM_MODEL")" ]]; then + env_set "$env_file" "LLM_MODEL" "$(_derive_llm_model "$gguf_file")" + fi + else + warn "Model file exists but too small (${file_size} bytes) — likely corrupt" + rm -f "${models_dir}/${gguf_file}" + fi + fi +fi + +# Check for ANY .gguf file as fallback +if [[ "$model_ready" != "true" ]]; then + any_model=$(find "$models_dir" -name "*.gguf" -size +100M 2>>"$LOGFILE" | head -1 || echo "") + if [[ -n "$any_model" ]]; then + found_name=$(basename "$any_model") + env_set "$env_file" "GGUF_FILE" "$found_name" + env_set "$env_file" "LLM_MODEL" "$(_derive_llm_model "$found_name")" + model_ready=true + log "Found existing model: ${found_name} — updated GGUF_FILE" + fi +fi + +# ── Step 3: Download bootstrap model if nothing usable exists ───────────────── +if [[ "$model_ready" != "true" ]]; then + # [FIX: disk-check] Verify disk space before downloading + if ! check_disk_for_download "$models_dir" 2; then + err "Cannot download bootstrap model — insufficient disk space" + warn "Continuing without a model — llama-server will not start" + else + if [[ "${TLS_OK:-true}" != "true" ]]; then + warn "Skipping bootstrap download because TLS trust is broken (TLS_OK=false)" + warn "Fix TLS trust (proxy root CA) and re-run setup to download models" + else + warn "No usable model found — downloading bootstrap model..." 
+ bootstrap_url="https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf" + bootstrap_name="Qwen3-0.6B-Q4_K_M.gguf" + + if command -v aria2c &>/dev/null; then + set +e + aria2c -x 8 -s 8 -k 5M --file-allocation=none --console-log-level=notice \ + --check-integrity=true \ + -d "$models_dir" -o "$bootstrap_name" "$bootstrap_url" 2>&1 | tail -5 + dl_rc=${PIPESTATUS[0]} + set -e + if [[ "$dl_rc" -ne 0 ]]; then + warn "Bootstrap download failed (aria2c exit ${dl_rc}) — check TLS/proxy CA" + fi + else + set +e + curl -L --fail --progress-bar -o "${models_dir}/${bootstrap_name}" "$bootstrap_url" + dl_rc=$? + set -e + if [[ "$dl_rc" -ne 0 ]]; then + warn "Bootstrap download failed (curl exit ${dl_rc}) — check TLS/proxy CA" + fi + fi + + # [FIX: bootstrap-size] Validate downloaded file size (>50MB for smallest GGUF) + if [[ -f "${models_dir}/${bootstrap_name}" ]]; then + dl_size=$(stat -c%s "${models_dir}/${bootstrap_name}" || echo 0) + if [[ "$dl_size" -gt 50000000 ]]; then + env_set "$env_file" "GGUF_FILE" "$bootstrap_name" + env_set "$env_file" "LLM_MODEL" "$(_derive_llm_model "$bootstrap_name")" + model_ready=true + log "Bootstrap model downloaded: ${bootstrap_name} ($(( dl_size / 1048576 )) MB)" + else + err "Downloaded model too small (${dl_size} bytes) — likely incomplete or corrupt" + rm -f "${models_dir}/${bootstrap_name}" + warn "Continuing without a model — llama-server will not start" + fi + else + err "Failed to download bootstrap model — llama-server will not start" + warn "Continuing anyway — other services may still work" + fi + fi + fi +fi + +# ── Step 4: Queue background download of tier model if needed ───────────────── +# If we're running a smaller model than what the GPU can handle, download the +# tier model in the background. The swap watcher will hot-swap GGUF_FILE and +# recreate llama-server via `docker compose up -d` once the download completes. 
+current_gguf=$(env_get "$env_file" "GGUF_FILE")
+if [[ "${TLS_OK:-true}" != "true" ]]; then
+    warn "Skipping tier model download because TLS trust is broken (TLS_OK=false)"
+elif [[ -n "$tier_gguf" && "$tier_gguf" != "${current_gguf:-}" ]]; then
+    # Determine disk space needed (model size in MB → GB, rounded up + 2GB buffer)
+    needed_gb=$(( (${tier_size_mb:-0} + 1023) / 1024 + 2 ))  # +1023 turns the integer division into a ceiling
+    [[ $needed_gb -lt 5 ]] && needed_gb=5
+
+    if check_disk_for_download "$models_dir" "$needed_gb"; then
+        # Resolve URL: prefer TIER_GGUF_URL from tier resolution, fallback to resolve_model_url
+        if [[ -z "$tier_url" ]]; then
+            tier_url=$(resolve_model_url "$DS_DIR" "$tier_gguf") || tier_url=""
+        fi
+
+        if [[ -n "$tier_url" ]]; then
+            log "Queuing background download: ${tier_gguf} (~${tier_size_mb}MB)"
+            log " URL: ${tier_url}"
+            log " Current model: ${current_gguf:-none}"
+            log " Once complete, llama-server will auto-swap to the bigger model"
+            mkdir -p "${DS_DIR}/logs"
+
+            if command -v aria2c &>/dev/null; then
+                nohup aria2c \
+                    -x 8 -s 8 -k 10M \
+                    --continue=true \
+                    --max-tries=0 \
+                    --retry-wait=5 \
+                    --timeout=60 \
+                    --connect-timeout=30 \
+                    --file-allocation=none \
+                    --auto-file-renaming=false \
+                    --console-log-level=warn \
+                    --summary-interval=30 \
+                    --check-integrity=true \
+                    -d "$models_dir" \
+                    -o "$tier_gguf" \
+                    "$tier_url" \
+                    >> "${DS_DIR}/logs/aria2c-download.log" 2>&1 &
+            else
+                nohup curl -L --fail -o "${models_dir}/${tier_gguf}" "$tier_url" \
+                    >> "${DS_DIR}/logs/aria2c-download.log" 2>&1 &
+            fi
+
+            dl_pid=$!
+            _store_pid "aria2c-model" "$dl_pid"
+            log "Background download started (PID: ${dl_pid})"
+            create_model_swap_watcher "$DS_DIR" "$tier_gguf"
+        else
+            warn "Could not resolve download URL for ${tier_gguf} — staying on ${current_gguf:-bootstrap model}"
+        fi
+    else
+        warn "Insufficient disk for tier model (~${tier_size_mb}MB) — staying on ${current_gguf:-bootstrap model}"
+    fi
+elif [[ -n "$tier_gguf" && "$tier_gguf" == "${current_gguf:-}" ]]; then
+    log "Already running the GPU-optimal model: ${tier_gguf}"
+fi
+
+fix_known_uid_requirements "$data_dir" "$GPU_BACKEND"
+apply_data_acl "$models_dir"
+
+# Re-run VRAM context cap now that we know the actual model size
+_cap_context_for_vram "$DS_DIR"
diff --git a/dream-server/installers/p2p-gpu/phases/07-model-optimize.sh b/dream-server/installers/p2p-gpu/phases/07-model-optimize.sh
new file mode 100644
index 000000000..e4eba704c
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/07-model-optimize.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 07: Model Download Optimization
+# ============================================================================
+# Part of:  p2p-gpu/phases/
+# Purpose:  Resume incomplete downloads with aria2c multi-threaded transfer,
+#           start model swap watcher
+#
+# Expects:  DS_DIR, step(), log(), optimize_model_download()
+# Provides: Background aria2c download + model swap watcher (if needed)
+#
+# Fixes covered: #11 (HuggingFace Xet throttle)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 7/12: Optimizing model downloads"
+
+optimize_model_download "$DS_DIR"
diff --git a/dream-server/installers/p2p-gpu/phases/08-vastai-quirks.sh b/dream-server/installers/p2p-gpu/phases/08-vastai-quirks.sh
new file mode 100644
index 000000000..138c75005
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/08-vastai-quirks.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 08: Vast.ai Quirks
+# ============================================================================
+# Part of:  p2p-gpu/phases/
+# Purpose:  No-systemd workaround, /dev/shm remount, OpenCode crash-loop fix, image pre-pull
+#
+# Expects:  DS_DIR, DREAM_USER, step(), log(), warn(), prepull_docker_images()
+# Provides: Vast.ai-specific environment fixes applied
+#
+# Fixes covered: #18 (/dev/shm), #21 (no systemd), #22 (OpenCode crash-loop),
+#                #24 (/dev/shm too small)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 8/12: Applying Vast.ai-specific fixes"
+
+# ── No systemd ─────────────────────────────────────────────────────────────
+if ! command -v systemctl &>/dev/null && ! pidof systemd &>/dev/null; then
+    log "No systemd detected — Vast.ai environment confirmed"
+    dream_cli="${DS_DIR}/dream-cli"
+    if [[ -x "$dream_cli" ]]; then
+        # Start host agent early on no-systemd hosts so model downloads and dashboard
+        # operations are available before the compose stack fully settles.
+        # [NON-FATAL: host-agent] Agent start can be retried in later phases. Paths are quoted for the inner shell.
+        su - "$DREAM_USER" -c "cd '${DS_DIR}' && DREAM_HOME='${DS_DIR}' ./dream-cli agent start" 2>&1 || \
+            warn "Host agent start failed (non-fatal — will retry in phase 09)"
+    fi
+fi
+
+# ── OpenCode crash-loop disable ────────────────────────────────────────────
+if docker ps -a --format '{{.Names}} {{.Status}}' 2>&1 | grep -q 'dream-opencode.*Restarting'; then
+    warn "OpenCode is crash-looping — disabling to unblock other services"
+    dream_cli="${DS_DIR}/dream-cli"
+    if [[ -x "$dream_cli" ]]; then
+        # [NON-FATAL: opencode] Individual service failure does not block others.
+        su - "$DREAM_USER" -c "cd '${DS_DIR}' && ./dream-cli disable opencode" 2>&1 \
+            || warn "dream-cli disable opencode failed (non-fatal)"
+    else
+        # [NON-FATAL: opencode] Individual service failure does not block others.
+        docker stop dream-opencode || warn "opencode stop failed (non-fatal)"
+        # [NON-FATAL: opencode] Individual service failure does not block others.
+        docker rm dream-opencode || warn "opencode rm failed (non-fatal)"
+    fi
+fi
+
+# ── Shared memory fix ─────────────────────────────────────────────────────
+shm_size_kb=$(df -k /dev/shm 2>&1 | awk 'NR==2{print $2}' || echo 0)  # -k pins the unit to 1K blocks
+if [[ "${shm_size_kb:-0}" -lt 1048576 ]]; then
+    shm_mb=$(( ${shm_size_kb:-0} / 1024 ))  # default guards the arithmetic against an empty value
+    warn "/dev/shm is only ${shm_mb} MB — GPU containers may be memory-starved"
+    # [NON-FATAL: perf] Remount is a performance optimization only.
+    mount -o remount,size=4G /dev/shm || warn "/dev/shm remount failed (non-fatal)"
+fi
+
+# ── Pre-pull Docker images ─────────────────────────────────────────────────
+prepull_docker_images "$DS_DIR"
+
+log "Vast.ai environment fixes applied"
diff --git a/dream-server/installers/p2p-gpu/phases/09-services.sh b/dream-server/installers/p2p-gpu/phases/09-services.sh
new file mode 100644
index 000000000..cbb944fc3
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/09-services.sh
@@ -0,0 +1,312 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 09: Services & Health Check
+# ============================================================================
+# Part of:  p2p-gpu/phases/
+# Purpose:  Start all services, run health-check loop with llama-server
+#           diagnostics, report per-service status
+#
+# Expects:  DS_DIR, GPU_BACKEND, LOGFILE, step(), log(), warn(), err(),
+#           env_get(), env_set(), start_services(), run_gpu_assignment(), discover_all_services()
+# Provides: Running DreamServer stack with status report
+#
+# Fixes covered: #10 (Dashboard stuck), #20 (llama-server hang),
+#                #23 (CUDA OOM), #25 (ComfyUI hang)
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 9/12: Starting services"
+
+# Verify the configured model file exists - llama-server will crash without it
+_verify_model_file() {
+    local ds_dir="$1"
+    local env_file="${ds_dir}/.env"
+    local gguf_file models_dir
+
+    gguf_file="$(env_get "$env_file" "GGUF_FILE")"
+    gguf_file="${gguf_file:-Qwen3.5-9B-Q4_K_M.gguf}"
+    models_dir="${ds_dir}/data/models"
+
+    if [[ -f "${models_dir}/${gguf_file}" ]]; then
+        log "Model file verified: ${gguf_file} ($(du -h "${models_dir}/${gguf_file}" | cut -f1))"
+        return 0
+    fi
+
+    warn "Model file ${gguf_file} not found in ${models_dir}"
+
+    # Check if any .gguf file exists as fallback ('|| true': a failed/SIGPIPE'd find must not trip set -e + pipefail)
+    local fallback
+    fallback="$(find "$models_dir" -maxdepth 1 -name '*.gguf' -printf '%f\n' 2>/dev/null | head -1 || true)" # stderr expected: find probe
+    if [[ -n "$fallback" ]]; then
+        log "Found fallback model: ${fallback} - updating GGUF_FILE in .env"
+        env_set "$env_file" "GGUF_FILE" "$fallback"
+        return 0
+    fi
+
+    warn "No .gguf model files found - llama-server will be unhealthy"
+    warn "Download a model: wget -P ${models_dir} https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
+}
+
+# Verify that the model endpoint exposes at least one selectable model for Open WebUI.
+_verify_model_visibility() {
+    local env_file="${DS_DIR}/.env"
+    local ollama_port webui_port
+
+    ollama_port="$(env_get "$env_file" "OLLAMA_PORT")"
+    ollama_port="${ollama_port:-11434}"
+    webui_port="$(env_get "$env_file" "WEBUI_PORT")" # key phase 10 reads; NOTE(review): confirm which key .env defines
+    webui_port="${webui_port:-$(env_get "$env_file" "OPEN_WEBUI_PORT")}" # previously read key, kept as fallback
+
+    local model_count=0
+    local models_json
+    models_json="$(curl -sf --max-time 5 "http://127.0.0.1:${ollama_port}/v1/models" 2>/dev/null || echo "")" # stderr expected: service may not be ready
+    if [[ -n "$models_json" ]]; then
+        model_count="$(echo "$models_json" | python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("data", [])))' 2>/dev/null || echo 0)" # stderr expected: json parse may fail if the endpoint is not ready
+    fi
+
+    if [[ "$model_count" -gt 0 ]]; then
+        log "LLM model visible on API (${model_count} model(s) on port ${ollama_port})"
+    else
+        warn "No models visible on llama-server API (port ${ollama_port}) — Open WebUI may show 'Model not selected'"
+        warn "Check: curl http://127.0.0.1:${ollama_port}/v1/models"
+        warn "Open WebUI port ${webui_port:-3000} should refresh after the model API becomes available"
+    fi
+}
+
+# Pre-flight: confirm the model file, then (multi-GPU hosts only) assign GPUs to services before startup
+_verify_model_file "$DS_DIR"
+
+if [[ "${GPU_COUNT:-0}" -ge "${MULTIGPU_MIN_GPUS:-2}" ]]; then
+    run_gpu_assignment "$DS_DIR" "${DS_DIR}/.env"
+fi
+
+start_services "$DS_DIR"
+
+# ── Health-check loop with llama-server diagnostics ─────────────────────────
+_run_health_check() {
+    local env_file="${DS_DIR}/.env"
+    local models_dir="${DS_DIR}/data/models"
+    local max_wait=120 elapsed=0 llama_diagnosed=false
+
+    echo -n " Waiting for services "
+    while [[ $elapsed -lt $max_wait ]]; do
+        local healthy running dash_api_status dashboard_status webui_status
+        healthy=$(docker ps --filter "health=healthy" --format '{{.Names}}' | wc -l)
+        running=$(docker ps --format '{{.Names}}' | wc -l)
+        dash_api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing") # stderr expected: container may not exist
+        dashboard_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing") # stderr expected: container may not exist
+        webui_status=$(
+            docker inspect --format '{{.State.Status}}' dream-webui 2>/dev/null || # stderr expected: container may not exist
+            docker inspect --format '{{.State.Status}}' dream-open-webui 2>/dev/null || # stderr expected: container may not exist
+            echo "missing"
+        )
+        echo -n "."
+
+        if [[ $healthy -ge 3 && "$dash_api_status" == "running" \
+            && ( "$dashboard_status" == "running" || "$webui_status" == "running" ) ]]; then
+            echo ""
+            log "Core services healthy (${healthy}/${running} containers)"
+            return 0
+        fi
+
+        # Diagnose llama-server once, at the first poll on or after the 45s mark
+        if [[ $elapsed -ge 45 && "$llama_diagnosed" != "true" ]]; then
+            llama_diagnosed=true
+            _diagnose_llama "$env_file" "$models_dir"
+        fi
+
+        sleep 5
+        elapsed=$((elapsed + 5))
+    done
+
+    echo ""
+    warn "Health-check timeout (${max_wait}s) — some services may still be starting"
+}
+# If llama-server is restarting, scan its last 20 log lines and dispatch to the matching handler.
+_diagnose_llama() {
+    local env_file="$1" models_dir="$2"
+    local llama_status
+    llama_status=$(docker inspect --format '{{.State.Status}}' dream-llama-server 2>&1 || echo "missing")
+
+    [[ "$llama_status" != "restarting" ]] && return 0
+
+    echo ""
+    warn "llama-server is crash-looping — diagnosing..."
+    local llama_logs
+    llama_logs=$(docker logs --tail 20 dream-llama-server 2>&1 || echo "")
+
+    if grep -qiw "out of memory\|OOM" <<< "$llama_logs"; then # -w: whole words only, so "room"/"bloom" no longer count as OOM
+        _handle_oom "$env_file" "$models_dir"
+    elif grep -qi "No such file\|model file not found\|failed to load" <<< "$llama_logs"; then
+        _handle_missing_model "$env_file" "$models_dir"
+    elif grep -qi "address already in use\|bind failed" <<< "$llama_logs"; then
+        err "Port conflict on llama-server port!"
+        warn "Check: ss -tlnp | grep :8080"
+    fi
+}
+# Point GGUF_FILE at the small bootstrap model (downloading it if absent) and restart llama-server.
+_handle_oom() {
+    local env_file="$1" models_dir="$2"
+    err "Model too large for GPU VRAM!"
+    warn "Switching to smallest bootstrap model..."
+    local tiny_url="https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
+    local tiny_name="Qwen3-0.6B-Q4_K_M.gguf"
+    local tiny_path="${models_dir}/${tiny_name}"
+    if [[ ! -f "$tiny_path" ]] && ! aria2c -x 8 -s 8 -d "$models_dir" -o "$tiny_name" "$tiny_url" 2>&1 \
+        && ! curl -sfL -o "$tiny_path" "$tiny_url"; then
+        rm -f "$tiny_path"; warn "Could not download ${tiny_name} — leaving GGUF_FILE unchanged"; return 0
+    fi
+    env_set "$env_file" "GGUF_FILE" "$tiny_name"
+    # [NON-FATAL: llama] NOTE(review): confirm a plain restart (no compose re-create) picks up the new GGUF_FILE.
+    docker restart dream-llama-server || warn "llama-server restart failed (non-fatal)"
+    echo -n " Retrying with smaller model "
+}
+# If GGUF_FILE names a file that is not on disk, switch to the first >50MB .gguf found and restart.
+_handle_missing_model() {
+    local env_file="$1" models_dir="$2"
+    err "Model file not found by llama-server!"
+    local current_gguf
+    current_gguf=$(env_get "$env_file" "GGUF_FILE")
+    if [[ -n "$current_gguf" && ! -f "${models_dir}/${current_gguf}" ]]; then
+        warn "GGUF_FILE='${current_gguf}' does not exist in ${models_dir}/"
+        local fallback
+        fallback=$(find "$models_dir" -name "*.gguf" -size +50M 2>/dev/null | head -1 | xargs -r basename || echo "") # stderr expected: must not leak into the file name
+        if [[ -n "$fallback" ]]; then
+            env_set "$env_file" "GGUF_FILE" "$fallback"
+            # [NON-FATAL: llama] Individual service failure does not block others.
+            docker restart dream-llama-server || warn "llama-server restart failed (non-fatal)"
+            warn "Switched to ${fallback}"
+        fi
+    fi
+}
+
+_run_health_check
+_verify_model_visibility
+
+# ── Service status report ──────────────────────────────────────────────────
+_report_service_status() {
+    echo ""
+    echo -e "${BOLD}Service Status:${NC}"
+    echo ""
+
+    local -a core_services=(
+        "llama-server|dream-llama-server"
+        "open-webui|dream-webui"
+        "dashboard|dream-dashboard"
+        "dashboard-api|dream-dashboard-api"
+    )
+    local -a heavy_services=() normal_services=()
+    local sid _pe _pd _name _cat _proxy startup _cname container_name # keep the read-loop fields out of global scope
+
+    while IFS='|' read -r sid _pe _pd _name _cat _proxy startup _cname; do
+        [[ -z "$sid" ]] && continue
+        case "$sid" in llama-server|open-webui|dashboard|dashboard-api) continue ;; esac # already listed in core_services
+        container_name="${_cname:-dream-${sid}}"
+        if [[ "$startup" == "heavy" ]]; then
+            heavy_services+=("${sid}|${container_name}")
+        else
+            normal_services+=("${sid}|${container_name}")
+        fi
+    done < <(discover_all_services "$DS_DIR")
+
+    _report_containers "${core_services[@]}"
+    _report_heavy "${heavy_services[@]}"
+    _report_normal "${normal_services[@]}"
+    _report_background_downloads
+
+    echo ""
+}
+# Print one status line per "service|container" argument, including services that are not deployed.
+_report_containers() {
+    local entry svc container
+    for entry in "$@"; do
+        IFS='|' read -r svc container <<< "$entry"
+        [[ -z "$container" ]] && container="dream-${svc}"
+
+        local status health
+        if ! status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null); then # stderr expected: container may not exist
+            status="not found"
+        fi
+        if ! health=$(docker inspect --format '{{.State.Health.Status}}' "$container" 2>/dev/null); then # stderr expected: container may not expose healthcheck
+            health="none"
+        fi
+
+        if [[ "$health" == "healthy" ]]; then
+            echo -e " ${GREEN}✓${NC} ${svc}: healthy"
+        elif [[ "$status" == "running" ]]; then
+            echo -e " ${YELLOW}◌${NC} ${svc}: starting up..."
+        elif [[ "$status" == "restarting" ]]; then
+            echo -e " ${RED}↻${NC} ${svc}: restarting (check: docker logs ${container})"
+        elif [[ "$status" == "not found" ]]; then
+            echo -e " ${DIM}·${NC} ${svc}: not deployed"
+        else
+            echo -e " ${RED}✗${NC} ${svc}: ${status}"
+        fi
+    done
+}
+# Like _report_containers, but silent for missing/exited containers and worded for slow-starting services.
+_report_heavy() {
+    local entry svc container
+    for entry in "$@"; do
+        IFS='|' read -r svc container <<< "$entry"
+        [[ -z "$container" ]] && container="dream-${svc}"
+
+        local status
+        if ! status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null); then # stderr expected: container may not exist
+            status="not found"
+        fi
+        [[ "$status" == "not found" || "$status" == "exited" ]] && continue
+
+        local health
+        if ! health=$(docker inspect --format '{{.State.Health.Status}}' "$container" 2>/dev/null); then # stderr expected: container may not expose healthcheck
+            health="none"
+        fi
+        if [[ "$health" == "healthy" ]]; then
+            echo -e " ${GREEN}✓${NC} ${svc}: ready"
+        elif [[ "$status" == "running" ]]; then
+            echo -e " ${CYAN}↓${NC} ${svc}: initializing in background"
+        elif [[ "$status" == "restarting" ]]; then
+            echo -e " ${YELLOW}↻${NC} ${svc}: restarting (downloading models)"
+        fi
+    done
+}
+# Report remaining services; only healthy or running containers produce a line.
+_report_normal() {
+    local entry svc container
+    for entry in "$@"; do
+        IFS='|' read -r svc container <<< "$entry"
+        [[ -z "$container" ]] && container="dream-${svc}"
+
+        local status
+        if ! status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null); then # stderr expected: container may not exist
+            status="not found"
+        fi
+        [[ "$status" == "not found" || "$status" == "exited" ]] && continue
+
+        local health
+        if ! health=$(docker inspect --format '{{.State.Health.Status}}' "$container" 2>/dev/null); then # stderr expected: container may not expose healthcheck
+            health="none"
+        fi
+        if [[ "$health" == "healthy" ]]; then
+            echo -e " ${GREEN}✓${NC} ${svc}: healthy"
+        elif [[ "$status" == "running" ]]; then
+            echo -e " ${YELLOW}◌${NC} ${svc}: starting up..."
+        fi
+    done
+}
+# Mention model downloads that are still running (pgrep patterns are extended regexes: '|' alternates unescaped).
+_report_background_downloads() {
+    if pgrep -f "aria2c.*gguf" > /dev/null 2>&1; then
+        echo -e " ${CYAN}↓${NC} LLM model: upgrading in background (aria2c)"
+        echo " Monitor: tail -f ${DS_DIR}/logs/aria2c-download.log"
+    fi
+    local bg_upgrade="${DS_DIR}/logs/model-upgrade.log"
+    if [[ -f "$bg_upgrade" ]] && pgrep -f "model-upgrade|model.*download" > /dev/null 2>&1; then
+        echo -e " ${CYAN}↓${NC} LLM model: upgrading in background (DreamServer)"
+    fi
+}
+
+_report_service_status
diff --git a/dream-server/installers/p2p-gpu/phases/10-voice-stack.sh b/dream-server/installers/p2p-gpu/phases/10-voice-stack.sh
new file mode 100644
index 000000000..7c8f82a92
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/10-voice-stack.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 10: Voice Stack
+# ============================================================================
+# Part of:  p2p-gpu/phases/
+# Purpose:  Bootstrap Whisper ASR model + Kokoro TTS readiness gate
+#
+# Expects:  DS_DIR, step(), warn(), env_get(), wait_for_http(), ensure_whisper_asr_model(), ensure_tts_model_ready()
+# Provides: Voice services (STT/TTS) initialized with models
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 10/12: Verifying TTS/STT model availability"
+
+ensure_whisper_asr_model "$DS_DIR"
+ensure_tts_model_ready "$DS_DIR"
+# Warn (never fail) when a running Open WebUI container does not answer /health in time.
+_check_open_webui_health() {
+    local env_file="${DS_DIR}/.env"
+    local webui_port
+    webui_port="$(env_get "$env_file" "WEBUI_PORT")"
+    webui_port="${webui_port:-3000}"
+
+    if docker ps --format '{{.Names}}' | grep -qxE 'dream-(open-)?webui'; then # both container names phase 09 probes
+        if ! wait_for_http "http://127.0.0.1:${webui_port}/health" 60 4; then
+            warn "Open WebUI not healthy yet — STT requests may return server connection errors"
+        fi
+    fi
+}
+
+_check_open_webui_health
diff --git a/dream-server/installers/p2p-gpu/phases/11-access-layer.sh b/dream-server/installers/p2p-gpu/phases/11-access-layer.sh
new file mode 100644
index 000000000..29dfffa3c
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/11-access-layer.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 11: Access Layer
+# ============================================================================
+# Part of:  p2p-gpu/phases/
+# Purpose:  Cloudflare tunnel, SSH tunnel scripts, and access guidance
+#
+# Expects:  DS_DIR, GPU_BACKEND, log(), warn(), setup_cloudflare_tunnel(),
+#           generate_ssh_tunnel_script(),
+#           generate_powershell_tunnel_script(),
+#           comfyui_preload_models()
+# Provides: All access methods configured for Vast.ai connectivity
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 11/12: Setting up access layer"
+
+# ComfyUI extra model downloads (if configured)
+comfyui_preload_models "$DS_DIR" "$GPU_BACKEND"
+
+# Prefer SSH tunnel mode for Vast.ai reliability and Windows compatibility.
+log "Using SSH tunnel mode for access (no public reverse-proxy URLs shown)"
+
+# Cloudflare Tunnel (optional) — the helper is called unconditionally from here
+setup_cloudflare_tunnel "$DS_DIR"
+
+# Auto-reconnecting SSH tunnel script, plus its PowerShell counterpart
+generate_ssh_tunnel_script "$DS_DIR"
+generate_powershell_tunnel_script "$DS_DIR"
diff --git a/dream-server/installers/p2p-gpu/phases/12-summary.sh b/dream-server/installers/p2p-gpu/phases/12-summary.sh
new file mode 100644
index 000000000..5e95bf845
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/phases/12-summary.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Phase 12: Summary
+# ============================================================================
+# Part of:  p2p-gpu/phases/
+# Purpose:  Print access info, append a completion marker to the log file, final success message
+#
+# Expects:  DS_DIR, LOGFILE, step(), log(), warn(), print_access_info(), _ts()
+# Provides: User-facing summary of all access methods
+#
+# SPDX-License-Identifier: Apache-2.0
+# ============================================================================
+
+set -euo pipefail
+
+step "Phase 12/12: Setup complete"
+
+print_access_info "$DS_DIR"
+
+# [NON-FATAL: logging] A failed write to LOGFILE only warns; it must not block completion.
+echo "=== Setup completed at $(_ts) ===" >> "$LOGFILE" || warn "logfile write failed (non-fatal)"
+log "Setup complete! Core services ready. Heavy services downloading in background."
diff --git a/dream-server/installers/p2p-gpu/setup.sh b/dream-server/installers/p2p-gpu/setup.sh
new file mode 100755
index 000000000..56d16dd77
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/setup.sh
@@ -0,0 +1,193 @@
+#!/usr/bin/env bash
+# ============================================================================
+# DreamServer — P2P GPU Deploy Orchestrator
+# ============================================================================
+# Deploy DreamServer on peer-to-peer GPU marketplaces (Vast.ai)
+#
+# Target:  Remote GPU instance (NVIDIA, AMD, or CPU-only)
+# OS:      Ubuntu 22.04 / 24.04
+# License: Apache-2.0 (same as DreamServer)
+#
+# Usage:
+#   bash setup.sh              # Full install
+#   bash setup.sh --resume     # Quick restart (re-apply fixes + start)
+#   bash setup.sh --status     # Health check
+#   bash setup.sh --info       # Show connection URLs
+#   bash setup.sh --fix        # Apply fixes + restart (no reinstall)
+#   bash setup.sh --teardown   # Stop all services
+#
+# This file sources library modules (pure functions) then runs each install
+# phase in order. Modules live under:
+#   lib/         — reusable function libraries
+#   phases/      — sequential install steps (execute on source)
+#   subcommands/ — alternative entry points (--teardown, --status, etc.)
+#
+# Design: adapted from DreamServer CLAUDE.md for provider environments
+#   Let It Crash > KISS > Pure Functions > SOLID
+#   set -euo pipefail everywhere. Non-fatal paths use || warn (per
+#   CLAUDE.md §4) because on rented hardware, partial stack > dead stack.
+# ============================================================================
+
+set -euo pipefail
+IFS=$'\n\t'
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "$0")"
+DRY_RUN=false
+
+# ── Source libraries ────────────────────────────────────────────────────────
+source "${SCRIPT_DIR}/lib/constants.sh"
+source "${SCRIPT_DIR}/lib/logging.sh"
+source "${SCRIPT_DIR}/lib/environment.sh"
+source "${SCRIPT_DIR}/lib/permissions.sh"
+source "${SCRIPT_DIR}/lib/services.sh"
+source "${SCRIPT_DIR}/lib/networking.sh"
+source "${SCRIPT_DIR}/lib/models.sh"
+source "${SCRIPT_DIR}/lib/gpu-topology.sh"
+source "${SCRIPT_DIR}/lib/compatibility.sh"
+
+# ── Source subcommands ──────────────────────────────────────────────────────
+source "${SCRIPT_DIR}/subcommands/teardown.sh"
+source "${SCRIPT_DIR}/subcommands/status.sh"
+source "${SCRIPT_DIR}/subcommands/resume.sh"
+source "${SCRIPT_DIR}/subcommands/fix.sh"
+source "${SCRIPT_DIR}/subcommands/info.sh"
+
+# ── Subcommand routing (only $1 is inspected; an empty $1 falls through to the full install) ──
+_route_subcommand() {
+    case "${1:-}" in
+        --teardown|teardown) cmd_teardown; exit 0 ;;
+        --status|status) cmd_status; exit 0 ;;
+        --resume|resume) cmd_resume; exit 0 ;;
+        --fix|fix) cmd_fix; exit 0 ;;
+        --info|info) cmd_info; exit 0 ;;
+        --dry-run) DRY_RUN=true ;;
+        --version) echo "dreamserver-vastai-setup v${VASTAI_VERSION}"; exit 0 ;;
+        --help|-h) _print_help; exit 0 ;;
+        ?*) err "Unknown option: ${1}"; echo "Run 'bash ${SCRIPT_NAME} --help'"; exit 1 ;; # any other non-empty argument, so a typo cannot start a full install
+    esac
+}
+# Print the command overview and common usage scenarios.
+_print_help() {
+    echo ""
+    echo -e "${BOLD}DreamServer — Vast.ai Setup v${VASTAI_VERSION}${NC}"
+    echo ""
+    echo -e "${BOLD}Usage:${NC} bash ${SCRIPT_NAME} [COMMAND]"
+    echo ""
+    echo -e "${BOLD}Commands:${NC}"
+    echo " (no args) Full install (first time) or re-install"
+    echo " --resume Quick restart — re-apply fixes and start services"
+    echo " --status Health check — show GPU, containers, ports"
+    echo " --info Show connection URLs and SSH tunnel commands"
+    echo " --fix Apply latest fixes without full re-install"
+    echo " --teardown Stop all services"
+    echo " --dry-run Preview what would happen without making changes"
+    echo " --help Show this help"
+    echo ""
+    echo -e "${BOLD}Common scenarios:${NC}"
+    echo " First time: bash ${SCRIPT_NAME}"
+    echo " SSH dropped: bash ${SCRIPT_NAME} --resume"
+    echo " Services broken: bash ${SCRIPT_NAME} --fix"
+    echo " Check status: bash ${SCRIPT_NAME} --status"
+    echo " Done for the day: bash ${SCRIPT_NAME} --teardown"
+    echo ""
+}
+
+# ── Smart re-run detection (asks before re-installing over running dream-* containers) ──
+_check_existing_install() {
+    local existing_dir
+    existing_dir=$(find_dream_dir 2>/dev/null || echo "") # stderr expected: keep diagnostics out of the captured path
+    if [[ -n "$existing_dir" && -f "${existing_dir}/.env" ]]; then
+        local running_count
+        running_count=$(docker ps --format '{{.Names}}' 2>&1 | grep -c '^dream-' || true) # grep -c already prints 0 on no match
+        if [[ "${running_count:-0}" -gt 0 ]]; then
+            echo ""
+            echo -e "${YELLOW}${BOLD} DreamServer is already installed (${running_count} services running).${NC}"
+            echo ""
+            echo -e " You probably want:"
+            echo -e " ${BOLD}bash ${SCRIPT_NAME} --resume${NC} → Quick restart + fixes"
+            echo -e " ${BOLD}bash ${SCRIPT_NAME} --fix${NC} → Apply fixes only"
+            echo -e " ${BOLD}bash ${SCRIPT_NAME} --status${NC} → Check health"
+            echo ""
+            echo -n -e " Continue with full re-install? [y/N] "
+            local answer
+            read -r -t 15 answer || answer="n"
+            if [[ "${answer,,}" != "y" && "${answer,,}" != "yes" ]]; then
+                log "Aborted. Use --resume, --fix, --status, or --info."
+                exit 0
+            fi
+            echo ""
+        fi
+    fi
+}
+
+# ── Main install flow ──────────────────────────────────────────────────────
+main() {
+    _route_subcommand "${1:-}"
+
+    # ── Full install ──────────────────────────────────────────────────────
+    echo ""
+    echo -e "${CYAN}${BOLD} DreamServer — Vast.ai Setup v${VASTAI_VERSION}${NC}"
+    echo -e "${DIM} https://github.com/Light-Heart-Labs/DreamServer${NC}"
+    echo ""
+
+    setup_cleanup_trap
+    acquire_lock
+    mkdir -p "$(dirname "$LOGFILE")"
+    # [NON-FATAL: logging] Setup can proceed even if the logfile is unwritable.
+    echo "=== Setup started at $(_ts) ===" >> "$LOGFILE" || warn "logfile write failed (non-fatal)"
+
+    [[ "$DRY_RUN" == "true" ]] || _check_existing_install # a preview must not stop at (or time out on) the re-install prompt
+
+    # ── Dry-run mode: preview without executing ────────────────────────
+    if [[ "$DRY_RUN" == "true" ]]; then
+        echo ""
+        echo -e "${BOLD}Dry-run mode — no changes will be made.${NC}"
+        echo ""
+        echo "This setup would:"
+        echo " 1. Detect GPU and validate system requirements"
+        echo " 2. Install dependencies (sudo, git, curl, jq, aria2, etc.)"
+        echo " 3. Create 'dream' user with Docker access"
+        echo " 4. Clone DreamServer from ${REPO_URL:-Light-Heart-Labs/DreamServer}"
+        echo " 5. Run DreamServer installer (non-interactive, 600s timeout)"
+        echo " 6. Apply post-install fixes (permissions, env defaults)"
+        echo " 7. Download/verify GGUF model for llama-server"
+        echo " 8. Apply Vast.ai-specific quirks (/dev/shm, no-systemd)"
+        echo " 9. Start Docker Compose services + health check"
+        echo " 10. Bootstrap voice stack (Whisper + Kokoro TTS)"
+        echo " 11. Set up reverse proxy (Caddy) + access tunnels"
+        echo " 12. Print connection info and SSH tunnel commands"
+        echo ""
+        echo -e "${BOLD}System:${NC}"
+        detect_gpu
+        echo " GPU: ${GPU_NAME} (${GPU_BACKEND}, ${GPU_VRAM} MB VRAM)"
+        echo " CPU: $(nproc) cores"
+        echo " Disk: $(df -BG --output=avail . 2>>"$LOGFILE" | tail -1 | tr -dc '0-9')GB available"
+        echo " Docker: $(docker --version 2>>"$LOGFILE" || echo 'not installed')"
+        echo ""
+        echo "Run without --dry-run to proceed."
+        exit 0
+    fi
+
+    # Shared state variables (set by phases, used across phases)
+    GPU_BACKEND="" GPU_NAME="" GPU_VRAM="" GPU_COUNT=0
+    CPU_COUNT=0 DISK_AVAIL_GB=0 COMPOSE_CMD=""
+    REPO_DIR="" DS_DIR=""
+
+    # ── Execute phases in order (each phase file runs its steps when sourced) ──
+    source "${SCRIPT_DIR}/phases/00-preflight.sh"
+    source "${SCRIPT_DIR}/phases/01-dependencies.sh"
+    source "${SCRIPT_DIR}/phases/02-user-setup.sh"
+    source "${SCRIPT_DIR}/phases/03-repository.sh"
+    source "${SCRIPT_DIR}/phases/04-installer.sh"
+    source "${SCRIPT_DIR}/phases/05-post-install.sh"
+    source "${SCRIPT_DIR}/phases/06-bootstrap-model.sh"
+    source "${SCRIPT_DIR}/phases/07-model-optimize.sh"
+    source "${SCRIPT_DIR}/phases/08-vastai-quirks.sh"
+    source "${SCRIPT_DIR}/phases/09-services.sh"
+    source "${SCRIPT_DIR}/phases/10-voice-stack.sh"
+    source "${SCRIPT_DIR}/phases/11-access-layer.sh"
+    source "${SCRIPT_DIR}/phases/12-summary.sh"
+}
+
+main "$@"
diff --git a/dream-server/installers/p2p-gpu/subcommands/fix.sh b/dream-server/installers/p2p-gpu/subcommands/fix.sh
new file mode 100644
index 000000000..a522eee56
--- /dev/null
+++ b/dream-server/installers/p2p-gpu/subcommands/fix.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Dream Server — Vast.ai Subcommand: fix
+# ============================================================================
+# Part of: p2p-gpu/subcommands/
+# Purpose: Apply fixes without full reinstall (port rebind, network fix,
+#          CPU cap, permissions, service restart)
+#
+# Expects: log(), warn(), err(), find_dream_dir(), detect_gpu_backend(),
+#          expose_ports_for_vastai(), apply_post_install_fixes(),
+#          start_services(), ensure_whisper_asr_model(), ensure_tts_model_ready(),
+#          generate_ssh_tunnel_script(),
+#
#          generate_powershell_tunnel_script(), print_access_info(),
#          get_compose_cmd()
# Provides: All runtime fixes applied and services restarted
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# Re-applies runtime fixes to an existing install and restarts the stack:
# port exposure, stale dream-network cleanup, post-install fixes, optional
# multi-GPU assignment, service start, voice models and tunnel scripts.
#
# Globals:  GPU_BACKEND, GPU_COUNT (read after detect_gpu), MULTIGPU_MIN_GPUS
# Returns:  exits 1 when the DreamServer directory cannot be found or entered
cmd_fix() {
    step "Applying fixes (no reinstall)"
    local ds_dir
    ds_dir=$(find_dream_dir) || { err "DreamServer directory not found. Run full install first."; exit 1; }

    cd "$ds_dir" || exit 1
    detect_gpu
    local gpu_backend="$GPU_BACKEND"

    expose_ports_for_vastai "$ds_dir"

    # Fix stale Docker network: a dream-network without the compose label is
    # removed here so that compose can recreate it on the next start.
    if docker network inspect dream-network >/dev/null 2>&1; then
        local net_label
        net_label=$(docker network inspect dream-network \
            --format '{{index .Labels "com.docker.compose.network"}}' 2>&1 || echo "")
        if [[ -z "$net_label" ]]; then
            log "Fixing stale dream-network..."
            local compose_cmd
            compose_cmd=$(get_compose_cmd)
            if [[ "$compose_cmd" == "docker compose" ]]; then
                # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
                docker compose down 2>&1 || warn "compose down failed (non-fatal)"
            else
                # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
                docker-compose down 2>&1 || warn "compose down failed (non-fatal)"
            fi

            # Collect the names of containers still attached. stderr is
            # discarded on purpose: with 2>&1 a docker error message would be
            # word-split below and each word treated as a container name.
            local attached
            attached=$(docker network inspect dream-network \
                -f '{{range .Containers}}{{.Name}} {{end}}' 2>/dev/null) || attached=""
            local cid
            # shellcheck disable=SC2086 — intentional split on the space-separated list
            for cid in $attached; do
                # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
                docker network disconnect -f dream-network "$cid" || warn "disconnect ${cid} failed (non-fatal)"
            done
            # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
            docker network rm dream-network || warn "network rm failed (non-fatal)"
            log "Stale network removed — compose will recreate on next start"
        fi
    fi

    apply_post_install_fixes "$ds_dir" "$gpu_backend"
    if [[ "${GPU_COUNT:-0}" -ge "${MULTIGPU_MIN_GPUS:-2}" ]]; then
        enumerate_gpus
        run_gpu_assignment "$ds_dir" "${ds_dir}/.env"
    fi

    log "Fixes applied. Restarting services..."
    start_services "$ds_dir"
    ensure_whisper_asr_model "$ds_dir"
    ensure_tts_model_ready "$ds_dir"

    generate_ssh_tunnel_script "$ds_dir"
    generate_powershell_tunnel_script "$ds_dir"

    print_access_info "$ds_dir"
    log "Fix complete!"
}

# diff --git a/dream-server/installers/p2p-gpu/subcommands/info.sh b/dream-server/installers/p2p-gpu/subcommands/info.sh
# new file mode 100644
# index 000000000..e15f00d50
# --- /dev/null
# +++ b/dream-server/installers/p2p-gpu/subcommands/info.sh
# @@ -0,0 +1,20 @@
#!/usr/bin/env bash
# ============================================================================
# Dream Server — Vast.ai Subcommand: info
# ============================================================================
# Part of: p2p-gpu/subcommands/
# Purpose: Print connection details only (no modifications)
#
# Expects: err(), find_dream_dir(), print_access_info()
# Provides: Display of all access methods and URLs
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# Prints the access information for the located DreamServer directory.
# Returns: exits 1 when the directory cannot be found.
cmd_info() {
    local ds_dir
    ds_dir=$(find_dream_dir) || { err "DreamServer directory not found. Run full install first."; exit 1; }
    print_access_info "$ds_dir"
}

# diff --git a/dream-server/installers/p2p-gpu/subcommands/resume.sh b/dream-server/installers/p2p-gpu/subcommands/resume.sh
# new file mode 100644
# index 000000000..d34573154
# --- /dev/null
# +++ b/dream-server/installers/p2p-gpu/subcommands/resume.sh
# @@ -0,0 +1,43 @@
#!/usr/bin/env bash
# ============================================================================
# Dream Server — Vast.ai Subcommand: resume
# ============================================================================
# Part of: p2p-gpu/subcommands/
# Purpose: Quick restart — re-apply fixes and start services
#
# Expects: log(), warn(), err(), step(), find_dream_dir(), detect_gpu(),
#          enumerate_gpus(), run_gpu_assignment(),
#          apply_post_install_fixes(), start_services(),
#          ensure_whisper_asr_model(), ensure_tts_model_ready(),
#          generate_ssh_tunnel_script(), generate_powershell_tunnel_script(),
#          print_access_info()
# Provides: Running DreamServer with latest fixes applied
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# Restarts an existing install: post-install fixes, optional multi-GPU
# assignment, service start, access summary, then voice models and tunnels.
#
# Globals:  GPU_BACKEND, GPU_COUNT (read after detect_gpu), MULTIGPU_MIN_GPUS
# Returns:  exits 1 when the DreamServer directory cannot be found or entered
cmd_resume() {
    step "Resuming DreamServer"
    local ds_dir
    ds_dir=$(find_dream_dir) || { err "DreamServer directory not found"; exit 1; }

    cd "$ds_dir" || exit 1
    detect_gpu
    local gpu_backend="$GPU_BACKEND"

    apply_post_install_fixes "$ds_dir" "$gpu_backend"
    if [[ "${GPU_COUNT:-0}" -ge "${MULTIGPU_MIN_GPUS:-2}" ]]; then
        enumerate_gpus
        run_gpu_assignment "$ds_dir" "${ds_dir}/.env"
    fi
    start_services "$ds_dir"
    print_access_info "$ds_dir"

    # Keep the remaining resume steps after the access summary so a later
    # optional failure does not hide the URLs and commands from the terminal.
    ensure_whisper_asr_model "$ds_dir"
    ensure_tts_model_ready "$ds_dir"
    generate_ssh_tunnel_script "$ds_dir"
    generate_powershell_tunnel_script "$ds_dir"
}

# diff --git a/dream-server/installers/p2p-gpu/subcommands/status.sh b/dream-server/installers/p2p-gpu/subcommands/status.sh
# new file mode 100644
# index 000000000..7a808328b
# --- /dev/null
# +++ b/dream-server/installers/p2p-gpu/subcommands/status.sh
# @@ -0,0 +1,70 @@
#!/usr/bin/env bash
# ============================================================================
# Dream Server — Vast.ai Subcommand: status
# ============================================================================
# Part of: p2p-gpu/subcommands/
# Purpose: Display GPU info, container status, download progress
#
# Expects: log(), warn(), err(), find_dream_dir(), detect_gpu_backend(),
#          LOGFILE
# Provides: Health status overview
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# Prints a read-only overview: GPU line(s), the docker ps table (first 20
# lines), container counts and, when an aria2c GGUF download is running, the
# last line of its log.
#
# Globals:  LOGFILE (tool stderr is appended to it), BOLD, CYAN, GREEN,
#           YELLOW, NC
# Returns:  exits 1 when the DreamServer directory cannot be found
cmd_status() {
    local ds_dir
    ds_dir=$(find_dream_dir) || { err "DreamServer directory not found"; exit 1; }

    echo -e "\n${BOLD}DreamServer Status${NC}\n"

    # GPU info
    local gpu_backend
    gpu_backend=$(detect_gpu_backend)
    case "$gpu_backend" in
        nvidia)
            # With pipefail the pipeline fails when nvidia-smi itself fails.
            if ! nvidia-smi --query-gpu=name,memory.total,memory.used,utilization.gpu \
                --format=csv,noheader 2>>"$LOGFILE" \
                | while IFS=',' read -r name mem_total mem_used util; do
                    echo -e " GPU: ${CYAN}${name}${NC} | VRAM: ${mem_used} /${mem_total} | Util: ${util}"
                done; then
                warn "NVIDIA backend detected but nvidia-smi query failed"
            fi
            ;;
        amd)
            if command -v rocm-smi >/dev/null 2>&1; then
                local amd_name amd_vram
                amd_name=$(rocm-smi --showproductname 2>>"$LOGFILE" | grep -oP 'Card series:\s*\K.*' | head -1 || echo "AMD GPU")
                amd_vram=$(rocm-smi --showmeminfo vram 2>>"$LOGFILE" | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' | head -1 || echo "0")
                # Values above 1,000,000 are treated as bytes and scaled to MiB.
                if [[ "${amd_vram:-0}" -gt 1000000 ]]; then
                    amd_vram=$(( amd_vram / 1048576 ))
                fi
                echo -e " GPU: ${CYAN}${amd_name}${NC} | VRAM: ${amd_vram} MiB"
            else
                warn "AMD backend detected but rocm-smi is not available"
            fi
            ;;
        *)
            echo " GPU: CPU-only mode (no accelerator detected)"
            ;;
    esac

    echo ""
    docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" 2>&1 | head -20

    echo ""
    local healthy running total
    healthy=$(docker ps --filter "health=healthy" --format '{{.Names}}' | wc -l)
    running=$(docker ps --format '{{.Names}}' | wc -l)
    # grep -c already prints "0" (and exits 1) when nothing matches, so
    # "|| echo 0" would print the zero twice; only the exit status is masked.
    total=$(docker ps -a --format '{{.Names}}' | grep -c '^dream-' || true)
    [[ "$total" =~ ^[0-9]+$ ]] || total=0
    echo -e " Containers: ${GREEN}${healthy}${NC} healthy / ${running} running / ${total} total"

    if pgrep -f "aria2c.*gguf" > /dev/null 2>&1; then
        echo -e " Model download: ${YELLOW}in progress${NC}"
        local dl_log="${ds_dir}/logs/aria2c-download.log"
        if [[ -f "$dl_log" ]]; then
            tail -1 "$dl_log" 2>&1 | sed 's/^/ /'
        fi
    fi
    echo ""
}

# diff --git a/dream-server/installers/p2p-gpu/subcommands/teardown.sh b/dream-server/installers/p2p-gpu/subcommands/teardown.sh
# new file mode 100644
# index 000000000..688ca0e99
# --- /dev/null
# +++ b/dream-server/installers/p2p-gpu/subcommands/teardown.sh
# @@ -0,0 +1,46 @@
#!/usr/bin/env bash
# ============================================================================
# DreamServer — P2P GPU Subcommand: teardown
# ============================================================================
# Part of: dream-server/installers/p2p-gpu/subcommands/
# Purpose: Stop all containers and background processes
#
# Expects: log(), warn(), err(), step(), find_dream_dir(), get_compose_cmd(),
#          _kill_stored_pid(), PIDFILE_DIR, SCRIPT_NAME
# Provides: Clean shutdown of all DreamServer services
#
# SPDX-License-Identifier: Apache-2.0
# ============================================================================

set -euo pipefail

# Brings the compose stack down (when docker-compose.base.yml is present in
# the DreamServer directory) and stops the PID-file tracked helpers.
#
# Globals:  SCRIPT_NAME, BOLD, NC (read)
# Returns:  exits 1 when the DreamServer directory cannot be found or entered
cmd_teardown() {
    step "Teardown — stopping all services"
    local ds_dir
    ds_dir=$(find_dream_dir) || { err "DreamServer directory not found"; exit 1; }

    cd "$ds_dir" || exit 1

    if [[ -f "docker-compose.base.yml" ]]; then
        local compose_cmd
        compose_cmd=$(get_compose_cmd)
        if [[ "$compose_cmd" == "docker compose" ]]; then
            # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
            docker compose down --remove-orphans 2>&1 || warn "Compose down had warnings (non-fatal)"
        else
            # [NON-FATAL: cleanup] Best-effort teardown — partial cleanup is better than none.
            docker-compose down --remove-orphans 2>&1 || warn "Compose down had warnings (non-fatal)"
        fi
    fi

    # [FIX: pkill] Use PID-file based cleanup instead of pkill -f
    _kill_stored_pid "aria2c-model"
    _kill_stored_pid "model-swap-watcher"
    _kill_stored_pid "cloudflared"

    log "All services stopped. Storage billing continues."
    log "To fully stop billing: delete the instance from the provider console."
    echo ""
    echo -e "${BOLD}Data preserved at:${NC} ${ds_dir}/data/"
    echo -e "${BOLD}To resume:${NC} bash ${SCRIPT_NAME} --resume"
}

# diff --git a/dream-server/installers/p2p-gpu/tests/test-nvml-mismatch.sh b/dream-server/installers/p2p-gpu/tests/test-nvml-mismatch.sh
# new file mode 100644
# index 000000000..ae1e48b64
# --- /dev/null
# +++ b/dream-server/installers/p2p-gpu/tests/test-nvml-mismatch.sh
# @@ -0,0 +1,91 @@
#!/usr/bin/env bash
# Regression: ensure NVML mismatch repair path is reachable under set -e.
set -euo pipefail

P2P_GPU_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
LOGFILE="$(mktemp -t p2p-gpu-nvml.XXXXXX)"
STUB_DIR="$(mktemp -d -t p2p-gpu-stub.XXXXXX)"
# Marker file written by the apt-get stub; its presence proves apt-get ran.
APT_CALLED_FILE="${STUB_DIR}/apt-called"
trap 'rm -f "$LOGFILE"; rm -rf "$STUB_DIR"' EXIT

# Minimal logging functions expected by environment.sh
log() { :; }
warn() { :; }
err() { :; }
step() { :; }

# Fails the test when the apt-get stub has been invoked.
assert_no_apt_call() {
    if [[ -e "$APT_CALLED_FILE" ]]; then
        echo "Expected repair path to skip apt-get" >&2
        exit 1
    fi
}

# shellcheck source=../lib/environment.sh
source "${P2P_GPU_DIR}/lib/environment.sh"

# Force mismatch status to validate repair path.
# Defined after sourcing so it overrides the version from environment.sh.
detect_nvml_mismatch() {
    return 1
}

# Put the stub directory first on PATH so the stubs shadow the real tools.
export PATH="${STUB_DIR}:${PATH}"
export APT_CALLED_FILE

cat >"${STUB_DIR}/apt-get" <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
echo "called" >> "${APT_CALLED_FILE}"
exit 0
EOF

cat >"${STUB_DIR}/systemctl" <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
exit 0
EOF

cat >"${STUB_DIR}/service" <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
exit 0
EOF

chmod +x "${STUB_DIR}/apt-get" "${STUB_DIR}/systemctl" "${STUB_DIR}/service"

# Replace sleep with a no-op so the test does not wait.
sleep() { :; }

# Case 1: the mismatch persists (detect_nvml_mismatch keeps returning 1).
# The status is captured through if/else so a non-zero return does not abort
# the script under set -e.
if repair_nvml_mismatch; then
    repair_status=0
else
    repair_status=$?
fi

if [[ "$repair_status" -ne 1 ]]; then
    echo "Expected repair_nvml_mismatch to return 1 when mismatch persists" >&2
    exit 1
fi

if [[ ! -s "$APT_CALLED_FILE" ]]; then
    echo "Expected repair path to invoke apt-get for NVML mismatch" >&2
    exit 1
fi

rm -f "$APT_CALLED_FILE"

# Case 2: detection itself fails (return 2); apt-get must not be invoked.
detect_nvml_mismatch() {
    return 2
}

if repair_nvml_mismatch; then
    repair_status=0
else
    repair_status=$?
fi

if [[ "$repair_status" -ne 1 ]]; then
    echo "Expected repair_nvml_mismatch to return 1 when detection fails" >&2
    exit 1
fi

assert_no_apt_call