Skip to content

Commit c07ef0c

Browse files
Update p2p-gpu hardening and docs
1 parent 6043119 commit c07ef0c

File tree

12 files changed

+104
-87
lines changed

12 files changed

+104
-87
lines changed

.github/workflows/p2p-gpu.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
name: P2P GPU Checks
2+
# Maintainer note: This CI is scoped to resources/p2p-gpu/** paths only and
3+
# runs zero tests against core DreamServer. It exists because the toolkit is
4+
# ~4,500 lines of bash executed as root on rented GPU instances — syntax and
5+
# lint correctness matter more here than in typical resources/ add-ons.
6+
# The smoke tests use a mock Docker binary; no real containers are started.
27

38
on:
49
pull_request:

resources/p2p-gpu/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,12 @@ p2p-gpu/
113113

114114
Aligned with DreamServer's [CLAUDE.md](../../CLAUDE.md):
115115

116-
- **Let It Crash** — `set -euo pipefail` everywhere; errors kill the process
116+
- **Let It Crash** — `set -euo pipefail` everywhere; errors crash unless explicitly marked non-fatal with `|| warn` (per CLAUDE.md). On rented GPU instances, a working dashboard with a broken ComfyUI beats a dead stack the user is paying for.
117117
- **KISS** — readable over clever; one function, one job
118118
- **Pure Functions** — libs have no side effects; phases are the imperative shell
119119
- **Manifest-Driven** — services auto-discovered from extension manifests (no hardcoded lists)
120120
- **PID-file tracking** — background processes tracked safely (no `pkill -f`)
121-
- **ACL-primary permissions** — setgid + POSIX ACLs; `chmod a+rwX` only as documented fallback
121+
- **ACL-primary permissions** — setgid + POSIX ACLs required (hard fail if unavailable); `chmod a+rwX` only for documented multi-UID directories where ACLs cannot express the pattern
122122

123123
## Commands
124124

@@ -139,7 +139,7 @@ Aligned with DreamServer's [CLAUDE.md](../../CLAUDE.md):
139139
- Dashboard model downloads (`/models` page) require the Dream host agent; setup now auto-starts it during service startup.
140140

141141
```bash
142-
MODEL="Qwen3-30B-A3B-Q4_K_M.gguf"; DS_DIR="${DS_DIR:-/home/dream/dream-server}"; LLM_MODEL="$(echo "$MODEL" | sed -E 's/\.(gguf|GGUF)$//' | sed -E 's/-Q[0-9]+([._][A-Za-z0-9]+)*$//' | tr '[:upper:]' '[:lower:]')"; cd "$DS_DIR" && sed -i "s|^GGUF_FILE=.*|GGUF_FILE=${MODEL}|" .env && { grep -q '^LLM_MODEL=' .env && sed -i "s|^LLM_MODEL=.*|LLM_MODEL=${LLM_MODEL}|" .env || echo "LLM_MODEL=${LLM_MODEL}" >> .env; } && docker compose $(cat .compose-flags 2>/dev/null) up -d llama-server && for c in dream-dreamforge dream-openclaw dream-dashboard-api dream-webui; do docker ps --format '{{.Names}}' | grep -qx "$c" && docker restart "$c" >/dev/null || true; done
142+
MODEL="Qwen3-30B-A3B-Q4_K_M.gguf"; DS_DIR="${DS_DIR:-/home/dream/dream-server}"; LLM_MODEL="$(echo "$MODEL" | sed -E 's/\.(gguf|GGUF)$//' | sed -E 's/-Q[0-9]+([._][A-Za-z0-9]+)*$//' | tr '[:upper:]' '[:lower:]')"; cd "$DS_DIR" && sed -i "s|^GGUF_FILE=.*|GGUF_FILE=${MODEL}|" .env && { grep -q '^LLM_MODEL=' .env && sed -i "s|^LLM_MODEL=.*|LLM_MODEL=${LLM_MODEL}|" .env || echo "LLM_MODEL=${LLM_MODEL}" >> .env; } && docker compose $(cat .compose-flags 2>/dev/null) up -d llama-server && for c in dream-dreamforge dream-openclaw dream-dashboard-api dream-webui; do docker ps --format '{{.Names}}' | grep -qx "$c" && docker restart "$c" >/dev/null || echo "[warn] ${c} restart failed (non-fatal)" >&2; done
143143
```
144144

145145
```bash

resources/p2p-gpu/lib/environment.sh

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ env_set() {
4747
env_get() {
4848
local file="$1" key="$2"
4949
[[ ! -f "$file" ]] && return 0
50-
grep "^${key}=" "$file" 2>/dev/null | head -1 | cut -d= -f2- \
50+
grep "^${key}=" "$file" 2>>"$LOGFILE" | head -1 | cut -d= -f2- \
5151
| sed 's/[[:space:]]#.*$//' | tr -d '"' | tr -d "'" || echo ""
5252
}
5353

@@ -168,13 +168,13 @@ cap_cpu_in_files() {
168168
get_compose_cpu_ceiling() {
169169
local host_nproc docker_ncpu ceiling
170170

171-
host_nproc=$(nproc 2>/dev/null || echo 1)
171+
host_nproc=$(nproc 2>>"$LOGFILE" || echo 1)
172172
if [[ ! "$host_nproc" =~ ^[0-9]+$ ]] || [[ "$host_nproc" -lt 1 ]]; then
173173
host_nproc=1
174174
fi
175175

176176
ceiling="$host_nproc"
177-
docker_ncpu=$(docker info --format '{{.NCPU}}' 2>/dev/null || echo "")
177+
docker_ncpu=$(docker info --format '{{.NCPU}}' 2>>"$LOGFILE" || echo "")
178178
if [[ "$docker_ncpu" =~ ^[0-9]+$ ]] && [[ "$docker_ncpu" -gt 0 ]] && [[ "$docker_ncpu" -lt "$ceiling" ]]; then
179179
ceiling="$docker_ncpu"
180180
fi
@@ -238,23 +238,23 @@ detect_gpu() {
238238

239239
if command -v nvidia-smi &>/dev/null && nvidia-smi --query-gpu=name --format=csv,noheader &>/dev/null 2>&1; then
240240
GPU_BACKEND="nvidia"
241-
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 | xargs)
242-
GPU_VRAM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 | xargs)
243-
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | wc -l)
241+
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>>"$LOGFILE" | head -1 | xargs)
242+
GPU_VRAM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>>"$LOGFILE" | head -1 | xargs)
243+
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>>"$LOGFILE" | wc -l)
244244
GPU_TOTAL_VRAM=0
245245
while read -r v; do GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + v )); done \
246-
< <(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null)
246+
< <(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>>"$LOGFILE")
247247
if [[ $GPU_TOTAL_VRAM -eq 0 ]]; then GPU_TOTAL_VRAM=$GPU_VRAM; fi
248248

249249
elif command -v rocm-smi &>/dev/null || [[ -e /dev/kfd ]]; then
250250
GPU_BACKEND="amd"
251-
GPU_NAME=$(rocm-smi --showproductname 2>/dev/null | grep -oP 'Card series:\s*\K.*' | head -1 || echo "AMD GPU")
252-
GPU_VRAM=$(rocm-smi --showmeminfo vram 2>/dev/null | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' | head -1 || echo "0")
251+
GPU_NAME=$(rocm-smi --showproductname 2>>"$LOGFILE" | grep -oP 'Card series:\s*\K.*' | head -1 || echo "AMD GPU")
252+
GPU_VRAM=$(rocm-smi --showmeminfo vram 2>>"$LOGFILE" | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' | head -1 || echo "0")
253253
# Convert bytes to MiB
254254
if [[ "${GPU_VRAM:-0}" -gt 1000000 ]]; then
255255
GPU_VRAM=$(( GPU_VRAM / 1048576 ))
256256
fi
257-
GPU_COUNT=$(rocm-smi --showid 2>/dev/null | grep -c 'GPU\[' || echo 1)
257+
GPU_COUNT=$(rocm-smi --showid 2>>"$LOGFILE" | grep -c 'GPU\[' || echo 1)
258258
if [[ $GPU_COUNT -ge 2 ]]; then
259259
GPU_TOTAL_VRAM=$(( GPU_VRAM * GPU_COUNT )) # rocm-smi per-device sum
260260
else
@@ -283,8 +283,8 @@ apply_post_install_fixes() {
283283
local data_dir="${ds_dir}/data"
284284
local env_file="${ds_dir}/.env"
285285
local cpu_count docker_cpu compose_ceiling max_cpu
286-
cpu_count=$(nproc 2>/dev/null || echo 1)
287-
docker_cpu=$(docker info --format '{{.NCPU}}' 2>/dev/null || echo "unknown")
286+
cpu_count=$(nproc 2>>"$LOGFILE" || echo 1)
287+
docker_cpu=$(docker info --format '{{.NCPU}}' 2>>"$LOGFILE" || echo "unknown")
288288

289289
[[ "$gpu_backend" == "auto" ]] && gpu_backend=$(detect_gpu_backend)
290290

resources/p2p-gpu/lib/gpu-topology.sh

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,17 @@ enumerate_gpus() {
4040
GPU_NAMES+=("$name")
4141
GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + ${vram%%.*} ))
4242
done < <(nvidia-smi --query-gpu=gpu_uuid,memory.total,name \
43-
--format=csv,noheader,nounits 2>/dev/null || true)
43+
--format=csv,noheader,nounits 2>>"$LOGFILE" || warn "nvidia-smi GPU enumeration failed (non-fatal)")
4444

4545
elif [[ "${GPU_BACKEND:-}" == "amd" ]]; then
4646
local idx=0
4747
while IFS= read -r line; do
4848
[[ -z "$line" ]] && continue
4949
local gpu_name
50-
gpu_name=$(rocm-smi -d "$idx" --showproductname 2>/dev/null \
50+
gpu_name=$(rocm-smi -d "$idx" --showproductname 2>>"$LOGFILE" \
5151
| grep -oP 'Card series:\s*\K.*' || echo "AMD GPU $idx")
5252
local vram_bytes
53-
vram_bytes=$(rocm-smi -d "$idx" --showmeminfo vram 2>/dev/null \
53+
vram_bytes=$(rocm-smi -d "$idx" --showmeminfo vram 2>>"$LOGFILE" \
5454
| grep -oP 'Total Memory \(B\):\s*\K[0-9]+' || echo "0")
5555
local vram_mb=$(( vram_bytes / 1048576 ))
5656
[[ $vram_mb -lt 1000 ]] && vram_mb=${GPU_VRAM:-0} # fallback
@@ -60,7 +60,7 @@ enumerate_gpus() {
6060
GPU_NAMES+=("$gpu_name")
6161
GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + vram_mb ))
6262
idx=$((idx + 1))
63-
done < <(rocm-smi --showid 2>/dev/null | grep 'GPU\[' || true)
63+
done < <(rocm-smi --showid 2>>"$LOGFILE" | grep 'GPU\[' || echo "")
6464
fi
6565

6666
# Sanity: if enumeration failed, fall back to count * per-GPU
@@ -85,8 +85,8 @@ generate_topology_json() {
8585
# Source upstream functions in subshell
8686
warn() { echo "WARN: $*" >&2; }
8787
err() { echo "ERR: $*" >&2; }
88-
source "${DS_DIR}/installers/lib/nvidia-topo.sh" 2>/dev/null
89-
detect_nvidia_topo 2>/dev/null
88+
source "${DS_DIR}/installers/lib/nvidia-topo.sh" 2>>"$LOGFILE"
89+
detect_nvidia_topo 2>>"$LOGFILE"
9090
) || upstream_topo=""
9191
if [[ -n "$upstream_topo" && "$upstream_topo" != "{}" ]]; then
9292
echo "$upstream_topo" > "$output_file"
@@ -133,7 +133,7 @@ TOPO_EOF
133133
_parse_nvidia_topo_links() {
134134
# Parse nvidia-smi topo -m matrix into JSON links array
135135
local matrix
136-
matrix=$(nvidia-smi topo -m 2>/dev/null) || { echo "[]"; return; }
136+
matrix=$(nvidia-smi topo -m 2>>"$LOGFILE") || { echo "[]"; return; }
137137

138138
# Strip ANSI escape codes
139139
matrix=$(echo "$matrix" | sed 's/\x1b\[[0-9;]*m//g')
@@ -251,8 +251,8 @@ run_gpu_assignment() {
251251

252252
# Save topology for dashboard-api
253253
mkdir -p "${ds_dir}/config"
254-
cp "$topo_file" "${ds_dir}/config/gpu-topology.json" 2>/dev/null || true
255-
chmod 644 "${ds_dir}/config/gpu-topology.json" 2>/dev/null || true
254+
cp "$topo_file" "${ds_dir}/config/gpu-topology.json" 2>>"$LOGFILE" || warn "failed to persist gpu-topology.json (non-fatal)"
255+
chmod 644 "${ds_dir}/config/gpu-topology.json" 2>>"$LOGFILE" || warn "failed to set mode on gpu-topology.json (non-fatal)"
256256

257257
# Enable P2P transfers when NVLink detected (avoids host RAM round-trip)
258258
if [[ -f "$topo_file" ]] && jq -e '.links[] | select(.link_type | startswith("NV"))' "$topo_file" &>/dev/null; then

resources/p2p-gpu/lib/models.sh

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ resolve_tier_for_gpu() {
7373
TIER="$tier"
7474
MODEL_PROFILE="${MODEL_PROFILE:-qwen}"
7575
error() { echo "ERROR: $*" >&2; return 1; }
76-
source "$tier_map" 2>/dev/null
77-
resolve_tier_config 2>/dev/null
76+
source "$tier_map" 2>>"$LOGFILE"
77+
resolve_tier_config 2>>"$LOGFILE"
7878
echo "${GGUF_FILE}|${GGUF_URL:-}|${LLM_MODEL_SIZE_MB:-0}"
7979
) || result=""
8080

@@ -136,7 +136,7 @@ check_disk_for_download() {
136136
local target_dir="$1"
137137
local min_gb="${2:-5}"
138138
local avail_gb
139-
avail_gb=$(df -BG --output=avail "$target_dir" 2>/dev/null | tail -1 | tr -dc '0-9')
139+
avail_gb=$(df -BG --output=avail "$target_dir" 2>>"$LOGFILE" | tail -1 | tr -dc '0-9')
140140
if [[ "${avail_gb:-0}" -lt "$min_gb" ]]; then
141141
warn "Insufficient disk space: ${avail_gb}GB available, ${min_gb}GB needed in ${target_dir}"
142142
return 1
@@ -148,7 +148,7 @@ check_disk_for_download() {
148148
# Store a background process PID so we can stop it safely later.
149149
_store_pid() {
150150
local name="$1" pid="$2"
151-
mkdir -p "$PIDFILE_DIR" 2>/dev/null || true
151+
mkdir -p "$PIDFILE_DIR" 2>>"$LOGFILE" || warn "could not create pidfile directory ${PIDFILE_DIR} (non-fatal)"
152152
echo "$pid" > "${PIDFILE_DIR}/${name}.pid"
153153
}
154154

@@ -158,9 +158,9 @@ _kill_stored_pid() {
158158
local pidfile="${PIDFILE_DIR}/${name}.pid"
159159
[[ ! -f "$pidfile" ]] && return 0
160160
local pid
161-
pid=$(cat "$pidfile" 2>/dev/null || echo "")
162-
if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
163-
kill "$pid" 2>/dev/null || warn "Could not kill ${name} (PID ${pid})"
161+
pid=$(cat "$pidfile" 2>>"$LOGFILE" || echo "")
162+
if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then # stderr expected: process may already have exited
163+
kill "$pid" 2>>"$LOGFILE" || warn "Could not kill ${name} (PID ${pid})"
164164
fi
165165
rm -f "$pidfile"
166166
}
@@ -171,8 +171,8 @@ _is_pid_running() {
171171
local pidfile="${PIDFILE_DIR}/${name}.pid"
172172
[[ ! -f "$pidfile" ]] && return 1
173173
local pid
174-
pid=$(cat "$pidfile" 2>/dev/null || echo "")
175-
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null
174+
pid=$(cat "$pidfile" 2>>"$LOGFILE" || echo "")
175+
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null # stderr expected: process may already have exited
176176
}
177177

178178
# Resolve download URL for a model filename
@@ -332,8 +332,8 @@ compose_cmd() {
332332
is_download_running() {
333333
[[ ! -f "$PIDFILE" ]] && return 1
334334
local pid
335-
pid=$(cat "$PIDFILE" 2>/dev/null || echo "")
336-
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null
335+
pid=$(cat "$PIDFILE" 2>/dev/null || echo "") # stderr expected: pidfile can be unreadable/missing during shutdown race
336+
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null # stderr expected: "No such process" while download exits
337337
}
338338
339339
swap_model() {
@@ -357,7 +357,7 @@ swap_model() {
357357
return 1
358358
fi
359359
local file_size
360-
file_size=$(stat -c%s "$model_path" 2>/dev/null || echo 0)
360+
file_size=$(stat -c%s "$model_path" 2>/dev/null || echo 0) # stderr expected: file can disappear during concurrent cleanup
361361
if [[ "$file_size" -lt 100000000 ]]; then
362362
warn "Model file too small (${file_size} bytes) — skipping swap"
363363
return 1

resources/p2p-gpu/lib/networking.sh

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ setup_cloudflare_tunnel() {
268268
-o "$cf_tmp" || { warn "cloudflared download failed (non-fatal)"; rm -f "$cf_tmp"; return 0; }
269269
# Verify checksum when available
270270
local expected_sha
271-
expected_sha=$(curl -sL --max-time 10 "$cf_checksum_url" 2>/dev/null | awk '{print $1}' || echo "")
271+
expected_sha=$(curl -sL --max-time 10 "$cf_checksum_url" 2>>"$LOGFILE" | awk '{print $1}' || echo "")
272272
if [[ -n "$expected_sha" ]]; then
273273
local actual_sha
274274
actual_sha=$(sha256sum "$cf_tmp" | awk '{print $1}')
@@ -295,7 +295,7 @@ setup_cloudflare_tunnel() {
295295
TUNNEL_TOKEN="$cf_token" nohup cloudflared tunnel --no-autoupdate run --token-from-env TUNNEL_TOKEN \
296296
>> "${ds_dir}/logs/cloudflared.log" 2>&1 &
297297
local cf_pid=$!
298-
_store_pid "cloudflared" "$cf_pid" 2>/dev/null || true
298+
_store_pid "cloudflared" "$cf_pid" 2>>"$LOGFILE" || warn "could not persist cloudflared pid (non-fatal)"
299299
log "Cloudflare Tunnel started (PID: ${cf_pid}) — HTTPS access active"
300300
}
301301

@@ -319,7 +319,7 @@ generate_ssh_tunnel_script() {
319319
echo "HOST=\"${host_ip}\""
320320
echo "SSH_PORT=\"${ssh_port}\""
321321
echo "ENTRY_PORT=\"${entry_port}\""
322-
echo '_uname="$(uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")"'
322+
echo '_uname="$(uname -s | tr "[:upper:]" "[:lower:]")"'
323323
echo 'case "${_uname}" in'
324324
echo " mingw*|msys*|cygwin*) _default_local_proxy=${local_proxy_port} ;;"
325325
echo " *) _default_local_proxy=${local_proxy_port} ;;"
@@ -402,11 +402,13 @@ print_access_info() {
402402
local dash_api_status dashboard_status webui_status
403403
host_ip="${PUBLIC_IPADDR:-$(curl -sf --max-time 5 ifconfig.me || echo '<your-vast-ip>')}"
404404
ssh_port="${VAST_TCP_PORT_22:-22}"
405-
dash_api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing")
406-
dashboard_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing")
407-
webui_status=$(docker inspect --format '{{.State.Status}}' dream-webui 2>/dev/null \
408-
|| docker inspect --format '{{.State.Status}}' dream-open-webui 2>/dev/null \
409-
|| echo "missing")
405+
dash_api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing") # stderr expected: container may not exist
406+
dashboard_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing") # stderr expected: container may not exist
407+
webui_status=$(
408+
docker inspect --format '{{.State.Status}}' dream-webui 2>/dev/null || # stderr expected: container may not exist
409+
docker inspect --format '{{.State.Status}}' dream-open-webui 2>/dev/null || # stderr expected: container may not exist
410+
echo "missing"
411+
)
410412

411413
echo ""
412414
if [[ "$dash_api_status" == "running" && ( "$dashboard_status" == "running" || "$webui_status" == "running" ) ]]; then

0 commit comments

Comments
 (0)