Skip to content

Commit c07ef0c

Browse files
Update p2p-gpu hardening and docs
1 parent 6043119 commit c07ef0c

File tree

12 files changed

+104
-87
lines changed

12 files changed

+104
-87
lines changed

.github/workflows/p2p-gpu.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
name: P2P GPU Checks
2+
# Maintainer note: This CI is scoped to resources/p2p-gpu/** paths only and
3+
# runs zero tests against core DreamServer. It exists because the toolkit is
4+
# ~4,500 lines of bash executed as root on rented GPU instances — syntax and
5+
# lint correctness matter more here than in typical resources/ add-ons.
6+
# The smoke tests use a mock Docker binary; no real containers are started.
27

38
on:
49
pull_request:

resources/p2p-gpu/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,12 @@ p2p-gpu/
113113

114114
Aligned with DreamServer's [CLAUDE.md](../../CLAUDE.md):
115115

116-
- **Let It Crash** — `set -euo pipefail` everywhere; errors kill the process
116+
- **Let It Crash** — `set -euo pipefail` everywhere; errors crash unless explicitly marked non-fatal with `|| warn` (per CLAUDE.md). On rented GPU instances, a working dashboard with a broken ComfyUI beats a dead stack the user is paying for.
117117
- **KISS** — readable over clever; one function, one job
118118
- **Pure Functions** — libs have no side effects; phases are the imperative shell
119119
- **Manifest-Driven** — services auto-discovered from extension manifests (no hardcoded lists)
120120
- **PID-file tracking** — background processes tracked safely (no `pkill -f`)
121-
- **ACL-primary permissions** — setgid + POSIX ACLs; `chmod a+rwX` only as documented fallback
121+
- **ACL-primary permissions** — setgid + POSIX ACLs required (hard fail if unavailable); `chmod a+rwX` only for documented multi-UID directories where ACLs cannot express the pattern
122122

123123
## Commands
124124

@@ -139,7 +139,7 @@ Aligned with DreamServer's [CLAUDE.md](../../CLAUDE.md):
139139
- Dashboard model downloads (`/models` page) require the Dream host agent; setup now auto-starts it during service startup.
140140

141141
```bash
142-
MODEL="Qwen3-30B-A3B-Q4_K_M.gguf"; DS_DIR="${DS_DIR:-/home/dream/dream-server}"; LLM_MODEL="$(echo "$MODEL" | sed -E 's/\.(gguf|GGUF)$//' | sed -E 's/-Q[0-9]+([._][A-Za-z0-9]+)*$//' | tr '[:upper:]' '[:lower:]')"; cd "$DS_DIR" && sed -i "s|^GGUF_FILE=.*|GGUF_FILE=${MODEL}|" .env && { grep -q '^LLM_MODEL=' .env && sed -i "s|^LLM_MODEL=.*|LLM_MODEL=${LLM_MODEL}|" .env || echo "LLM_MODEL=${LLM_MODEL}" >> .env; } && docker compose $(cat .compose-flags 2>/dev/null) up -d llama-server && for c in dream-dreamforge dream-openclaw dream-dashboard-api dream-webui; do docker ps --format '{{.Names}}' | grep -qx "$c" && docker restart "$c" >/dev/null || true; done
142+
MODEL="Qwen3-30B-A3B-Q4_K_M.gguf"; DS_DIR="${DS_DIR:-/home/dream/dream-server}"; LLM_MODEL="$(echo "$MODEL" | sed -E 's/\.(gguf|GGUF)$//' | sed -E 's/-Q[0-9]+([._][A-Za-z0-9]+)*$//' | tr '[:upper:]' '[:lower:]')"; cd "$DS_DIR" && sed -i "s|^GGUF_FILE=.*|GGUF_FILE=${MODEL}|" .env && { grep -q '^LLM_MODEL=' .env && sed -i "s|^LLM_MODEL=.*|LLM_MODEL=${LLM_MODEL}|" .env || echo "LLM_MODEL=${LLM_MODEL}" >> .env; } && docker compose $(cat .compose-flags 2>/dev/null) up -d llama-server && for c in dream-dreamforge dream-openclaw dream-dashboard-api dream-webui; do docker ps --format '{{.Names}}' | grep -qx "$c" && docker restart "$c" >/dev/null || echo "[warn] ${c} restart failed (non-fatal)" >&2; done
143143
```
144144

145145
```bash

resources/p2p-gpu/lib/environment.sh

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ env_set() {
4747
env_get() {
4848
local file="$1" key="$2"
4949
[[ ! -f "$file" ]] && return 0
50-
grep "^${key}=" "$file" 2>/dev/null | head -1 | cut -d= -f2- \
50+
grep "^${key}=" "$file" 2>>"$LOGFILE" | head -1 | cut -d= -f2- \
5151
| sed 's/[[:space:]]#.*$//' | tr -d '"' | tr -d "'" || echo ""
5252
}
5353

@@ -168,13 +168,13 @@ cap_cpu_in_files() {
168168
get_compose_cpu_ceiling() {
169169
local host_nproc docker_ncpu ceiling
170170

171-
host_nproc=$(nproc 2>/dev/null || echo 1)
171+
host_nproc=$(nproc 2>>"$LOGFILE" || echo 1)
172172
if [[ ! "$host_nproc" =~ ^[0-9]+$ ]] || [[ "$host_nproc" -lt 1 ]]; then
173173
host_nproc=1
174174
fi
175175

176176
ceiling="$host_nproc"
177-
docker_ncpu=$(docker info --format '{{.NCPU}}' 2>/dev/null || echo "")
177+
docker_ncpu=$(docker info --format '{{.NCPU}}' 2>>"$LOGFILE" || echo "")
178178
if [[ "$docker_ncpu" =~ ^[0-9]+$ ]] && [[ "$docker_ncpu" -gt 0 ]] && [[ "$docker_ncpu" -lt "$ceiling" ]]; then
179179
ceiling="$docker_ncpu"
180180
fi
@@ -238,23 +238,23 @@ detect_gpu() {
238238

239239
if command -v nvidia-smi &>/dev/null && nvidia-smi --query-gpu=name --format=csv,noheader &>/dev/null 2>&1; then
240240
GPU_BACKEND="nvidia"
241-
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 | xargs)
242-
GPU_VRAM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 | xargs)
243-
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | wc -l)
241+
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>>"$LOGFILE" | head -1 | xargs)
242+
GPU_VRAM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>>"$LOGFILE" | head -1 | xargs)
243+
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>>"$LOGFILE" | wc -l)
244244
GPU_TOTAL_VRAM=0
245245
while read -r v; do GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + v )); done \
246-
< <(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null)
246+
< <(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>>"$LOGFILE")
247247
if [[ $GPU_TOTAL_VRAM -eq 0 ]]; then GPU_TOTAL_VRAM=$GPU_VRAM; fi
248248

249249
elif command -v rocm-smi &>/dev/null || [[ -e /dev/kfd ]]; then
250250
GPU_BACKEND="amd"
251-
GPU_NAME=$(rocm-smi --showproductname 2>/dev/null | grep -oP 'Card series:\s*\K.*' | head -1 || echo "AMD GPU")
252-
GPU_VRAM=$(rocm-smi --showmeminfo vram 2>/dev/null | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' | head -1 || echo "0")
251+
GPU_NAME=$(rocm-smi --showproductname 2>>"$LOGFILE" | grep -oP 'Card series:\s*\K.*' | head -1 || echo "AMD GPU")
252+
GPU_VRAM=$(rocm-smi --showmeminfo vram 2>>"$LOGFILE" | grep -oP 'Total Memory \(B\):\s*\K[0-9]+' | head -1 || echo "0")
253253
# Convert bytes to MiB
254254
if [[ "${GPU_VRAM:-0}" -gt 1000000 ]]; then
255255
GPU_VRAM=$(( GPU_VRAM / 1048576 ))
256256
fi
257-
GPU_COUNT=$(rocm-smi --showid 2>/dev/null | grep -c 'GPU\[' || echo 1)
257+
GPU_COUNT=$(rocm-smi --showid 2>>"$LOGFILE" | grep -c 'GPU\[' || echo 1)
258258
if [[ $GPU_COUNT -ge 2 ]]; then
259259
GPU_TOTAL_VRAM=$(( GPU_VRAM * GPU_COUNT )) # rocm-smi per-device sum
260260
else
@@ -283,8 +283,8 @@ apply_post_install_fixes() {
283283
local data_dir="${ds_dir}/data"
284284
local env_file="${ds_dir}/.env"
285285
local cpu_count docker_cpu compose_ceiling max_cpu
286-
cpu_count=$(nproc 2>/dev/null || echo 1)
287-
docker_cpu=$(docker info --format '{{.NCPU}}' 2>/dev/null || echo "unknown")
286+
cpu_count=$(nproc 2>>"$LOGFILE" || echo 1)
287+
docker_cpu=$(docker info --format '{{.NCPU}}' 2>>"$LOGFILE" || echo "unknown")
288288

289289
[[ "$gpu_backend" == "auto" ]] && gpu_backend=$(detect_gpu_backend)
290290

resources/p2p-gpu/lib/gpu-topology.sh

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,17 @@ enumerate_gpus() {
4040
GPU_NAMES+=("$name")
4141
GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + ${vram%%.*} ))
4242
done < <(nvidia-smi --query-gpu=gpu_uuid,memory.total,name \
43-
--format=csv,noheader,nounits 2>/dev/null || true)
43+
--format=csv,noheader,nounits 2>>"$LOGFILE" || warn "nvidia-smi GPU enumeration failed (non-fatal)")
4444

4545
elif [[ "${GPU_BACKEND:-}" == "amd" ]]; then
4646
local idx=0
4747
while IFS= read -r line; do
4848
[[ -z "$line" ]] && continue
4949
local gpu_name
50-
gpu_name=$(rocm-smi -d "$idx" --showproductname 2>/dev/null \
50+
gpu_name=$(rocm-smi -d "$idx" --showproductname 2>>"$LOGFILE" \
5151
| grep -oP 'Card series:\s*\K.*' || echo "AMD GPU $idx")
5252
local vram_bytes
53-
vram_bytes=$(rocm-smi -d "$idx" --showmeminfo vram 2>/dev/null \
53+
vram_bytes=$(rocm-smi -d "$idx" --showmeminfo vram 2>>"$LOGFILE" \
5454
| grep -oP 'Total Memory \(B\):\s*\K[0-9]+' || echo "0")
5555
local vram_mb=$(( vram_bytes / 1048576 ))
5656
[[ $vram_mb -lt 1000 ]] && vram_mb=${GPU_VRAM:-0} # fallback
@@ -60,7 +60,7 @@ enumerate_gpus() {
6060
GPU_NAMES+=("$gpu_name")
6161
GPU_TOTAL_VRAM=$(( GPU_TOTAL_VRAM + vram_mb ))
6262
idx=$((idx + 1))
63-
done < <(rocm-smi --showid 2>/dev/null | grep 'GPU\[' || true)
63+
done < <(rocm-smi --showid 2>>"$LOGFILE" | grep 'GPU\[' || echo "")
6464
fi
6565

6666
# Sanity: if enumeration failed, fall back to count * per-GPU
@@ -85,8 +85,8 @@ generate_topology_json() {
8585
# Source upstream functions in subshell
8686
warn() { echo "WARN: $*" >&2; }
8787
err() { echo "ERR: $*" >&2; }
88-
source "${DS_DIR}/installers/lib/nvidia-topo.sh" 2>/dev/null
89-
detect_nvidia_topo 2>/dev/null
88+
source "${DS_DIR}/installers/lib/nvidia-topo.sh" 2>>"$LOGFILE"
89+
detect_nvidia_topo 2>>"$LOGFILE"
9090
) || upstream_topo=""
9191
if [[ -n "$upstream_topo" && "$upstream_topo" != "{}" ]]; then
9292
echo "$upstream_topo" > "$output_file"
@@ -133,7 +133,7 @@ TOPO_EOF
133133
_parse_nvidia_topo_links() {
134134
# Parse nvidia-smi topo -m matrix into JSON links array
135135
local matrix
136-
matrix=$(nvidia-smi topo -m 2>/dev/null) || { echo "[]"; return; }
136+
matrix=$(nvidia-smi topo -m 2>>"$LOGFILE") || { echo "[]"; return; }
137137

138138
# Strip ANSI escape codes
139139
matrix=$(echo "$matrix" | sed 's/\x1b\[[0-9;]*m//g')
@@ -251,8 +251,8 @@ run_gpu_assignment() {
251251

252252
# Save topology for dashboard-api
253253
mkdir -p "${ds_dir}/config"
254-
cp "$topo_file" "${ds_dir}/config/gpu-topology.json" 2>/dev/null || true
255-
chmod 644 "${ds_dir}/config/gpu-topology.json" 2>/dev/null || true
254+
cp "$topo_file" "${ds_dir}/config/gpu-topology.json" 2>>"$LOGFILE" || warn "failed to persist gpu-topology.json (non-fatal)"
255+
chmod 644 "${ds_dir}/config/gpu-topology.json" 2>>"$LOGFILE" || warn "failed to set mode on gpu-topology.json (non-fatal)"
256256

257257
# Enable P2P transfers when NVLink detected (avoids host RAM round-trip)
258258
if [[ -f "$topo_file" ]] && jq -e '.links[] | select(.link_type | startswith("NV"))' "$topo_file" &>/dev/null; then

resources/p2p-gpu/lib/models.sh

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ resolve_tier_for_gpu() {
7373
TIER="$tier"
7474
MODEL_PROFILE="${MODEL_PROFILE:-qwen}"
7575
error() { echo "ERROR: $*" >&2; return 1; }
76-
source "$tier_map" 2>/dev/null
77-
resolve_tier_config 2>/dev/null
76+
source "$tier_map" 2>>"$LOGFILE"
77+
resolve_tier_config 2>>"$LOGFILE"
7878
echo "${GGUF_FILE}|${GGUF_URL:-}|${LLM_MODEL_SIZE_MB:-0}"
7979
) || result=""
8080

@@ -136,7 +136,7 @@ check_disk_for_download() {
136136
local target_dir="$1"
137137
local min_gb="${2:-5}"
138138
local avail_gb
139-
avail_gb=$(df -BG --output=avail "$target_dir" 2>/dev/null | tail -1 | tr -dc '0-9')
139+
avail_gb=$(df -BG --output=avail "$target_dir" 2>>"$LOGFILE" | tail -1 | tr -dc '0-9')
140140
if [[ "${avail_gb:-0}" -lt "$min_gb" ]]; then
141141
warn "Insufficient disk space: ${avail_gb}GB available, ${min_gb}GB needed in ${target_dir}"
142142
return 1
@@ -148,7 +148,7 @@ check_disk_for_download() {
148148
# Store a background process PID so we can stop it safely later.
149149
_store_pid() {
150150
local name="$1" pid="$2"
151-
mkdir -p "$PIDFILE_DIR" 2>/dev/null || true
151+
mkdir -p "$PIDFILE_DIR" 2>>"$LOGFILE" || warn "could not create pidfile directory ${PIDFILE_DIR} (non-fatal)"
152152
echo "$pid" > "${PIDFILE_DIR}/${name}.pid"
153153
}
154154

@@ -158,9 +158,9 @@ _kill_stored_pid() {
158158
local pidfile="${PIDFILE_DIR}/${name}.pid"
159159
[[ ! -f "$pidfile" ]] && return 0
160160
local pid
161-
pid=$(cat "$pidfile" 2>/dev/null || echo "")
162-
if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
163-
kill "$pid" 2>/dev/null || warn "Could not kill ${name} (PID ${pid})"
161+
pid=$(cat "$pidfile" 2>>"$LOGFILE" || echo "")
162+
if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then # stderr expected: process may already have exited
163+
kill "$pid" 2>>"$LOGFILE" || warn "Could not kill ${name} (PID ${pid})"
164164
fi
165165
rm -f "$pidfile"
166166
}
@@ -171,8 +171,8 @@ _is_pid_running() {
171171
local pidfile="${PIDFILE_DIR}/${name}.pid"
172172
[[ ! -f "$pidfile" ]] && return 1
173173
local pid
174-
pid=$(cat "$pidfile" 2>/dev/null || echo "")
175-
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null
174+
pid=$(cat "$pidfile" 2>>"$LOGFILE" || echo "")
175+
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null # stderr expected: process may already have exited
176176
}
177177

178178
# Resolve download URL for a model filename
@@ -332,8 +332,8 @@ compose_cmd() {
332332
is_download_running() {
333333
[[ ! -f "$PIDFILE" ]] && return 1
334334
local pid
335-
pid=$(cat "$PIDFILE" 2>/dev/null || echo "")
336-
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null
335+
pid=$(cat "$PIDFILE" 2>/dev/null || echo "") # stderr expected: pidfile can be unreadable/missing during shutdown race
336+
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null # stderr expected: "No such process" while download exits
337337
}
338338
339339
swap_model() {
@@ -357,7 +357,7 @@ swap_model() {
357357
return 1
358358
fi
359359
local file_size
360-
file_size=$(stat -c%s "$model_path" 2>/dev/null || echo 0)
360+
file_size=$(stat -c%s "$model_path" 2>/dev/null || echo 0) # stderr expected: file can disappear during concurrent cleanup
361361
if [[ "$file_size" -lt 100000000 ]]; then
362362
warn "Model file too small (${file_size} bytes) — skipping swap"
363363
return 1

resources/p2p-gpu/lib/networking.sh

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ setup_cloudflare_tunnel() {
268268
-o "$cf_tmp" || { warn "cloudflared download failed (non-fatal)"; rm -f "$cf_tmp"; return 0; }
269269
# Verify checksum when available
270270
local expected_sha
271-
expected_sha=$(curl -sL --max-time 10 "$cf_checksum_url" 2>/dev/null | awk '{print $1}' || echo "")
271+
expected_sha=$(curl -sL --max-time 10 "$cf_checksum_url" 2>>"$LOGFILE" | awk '{print $1}' || echo "")
272272
if [[ -n "$expected_sha" ]]; then
273273
local actual_sha
274274
actual_sha=$(sha256sum "$cf_tmp" | awk '{print $1}')
@@ -295,7 +295,7 @@ setup_cloudflare_tunnel() {
295295
TUNNEL_TOKEN="$cf_token" nohup cloudflared tunnel --no-autoupdate run --token-from-env TUNNEL_TOKEN \
296296
>> "${ds_dir}/logs/cloudflared.log" 2>&1 &
297297
local cf_pid=$!
298-
_store_pid "cloudflared" "$cf_pid" 2>/dev/null || true
298+
_store_pid "cloudflared" "$cf_pid" 2>>"$LOGFILE" || warn "could not persist cloudflared pid (non-fatal)"
299299
log "Cloudflare Tunnel started (PID: ${cf_pid}) — HTTPS access active"
300300
}
301301

@@ -319,7 +319,7 @@ generate_ssh_tunnel_script() {
319319
echo "HOST=\"${host_ip}\""
320320
echo "SSH_PORT=\"${ssh_port}\""
321321
echo "ENTRY_PORT=\"${entry_port}\""
322-
echo '_uname="$(uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")"'
322+
echo '_uname="$(uname -s | tr "[:upper:]" "[:lower:]")"'
323323
echo 'case "${_uname}" in'
324324
echo " mingw*|msys*|cygwin*) _default_local_proxy=${local_proxy_port} ;;"
325325
echo " *) _default_local_proxy=${local_proxy_port} ;;"
@@ -402,11 +402,13 @@ print_access_info() {
402402
local dash_api_status dashboard_status webui_status
403403
host_ip="${PUBLIC_IPADDR:-$(curl -sf --max-time 5 ifconfig.me || echo '<your-vast-ip>')}"
404404
ssh_port="${VAST_TCP_PORT_22:-22}"
405-
dash_api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing")
406-
dashboard_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing")
407-
webui_status=$(docker inspect --format '{{.State.Status}}' dream-webui 2>/dev/null \
408-
|| docker inspect --format '{{.State.Status}}' dream-open-webui 2>/dev/null \
409-
|| echo "missing")
405+
dash_api_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard-api 2>/dev/null || echo "missing") # stderr expected: container may not exist
406+
dashboard_status=$(docker inspect --format '{{.State.Status}}' dream-dashboard 2>/dev/null || echo "missing") # stderr expected: container may not exist
407+
webui_status=$(
408+
docker inspect --format '{{.State.Status}}' dream-webui 2>/dev/null || # stderr expected: container may not exist
409+
docker inspect --format '{{.State.Status}}' dream-open-webui 2>/dev/null || # stderr expected: container may not exist
410+
echo "missing"
411+
)
410412

411413
echo ""
412414
if [[ "$dash_api_status" == "running" && ( "$dashboard_status" == "running" || "$webui_status" == "running" ) ]]; then

0 commit comments

Comments
 (0)