-
Notifications
You must be signed in to change notification settings - Fork 72
Expand file tree
/
Copy pathhf-xet-diag-linux.sh
More file actions
executable file
·297 lines (255 loc) · 9.14 KB
/
hf-xet-diag-linux.sh
File metadata and controls
executable file
·297 lines (255 loc) · 9.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
#!/usr/bin/env bash
# hf-xet-diag-linux.sh — diagnostics wrapper for Linux.
# Launches a target command, takes periodic stack snapshots of it, applies a
# hang heuristic to those snapshots, and can write core dumps.
# Unless an output directory is given, one is derived from the (mangled)
# command string so results are easy to match to the command that produced them.

# Strict mode: ERR traps are inherited (-E), unhandled failures abort (-e),
# unset variables are errors (-u), and a pipeline fails if any stage fails.
set -E -e -u -o pipefail

# Defaults (the option parser may override them)
INTERVAL=120          # seconds between stack snapshots
PRELOAD_HELPER=true   # preload the ptrace helper unless --no-preload is given
OUTDIR=""             # output directory; empty means "generate one"
OUTDIR_SET=""         # becomes non-empty once the user passes -o/--outdir
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the usage text on stdout, one entry per line
#######################################
print_usage() {
  printf '%s\n' \
    'Usage: hf-xet-diag-linux.sh [options] -- <command> [args...]' \
    'Runs a target command, periodically snapshots stacks, detects hangs, and can dump cores.' \
    'Output directory defaults to include mangled command string for easy correlation.' \
    'Uses gdb for stack snapshots and hang detection.' \
    'Requires gdb, gcc, gcore. Install on Linux with:' \
    'sudo apt-get install gdb build-essential' \
    'Options:' \
    '-i, --interval SECONDS Stack snapshot cadence (default: 120)' \
    '-o, --outdir DIR Output directory (default: diag_<CMD>_<timestamp>)' \
    '--no-preload Do NOT preload ptrace-bypass helper' \
    '-h, --help Show this help' \
    'Examples:' \
    './hf-xet-diag-linux.sh -- python hfxet-test.py "Qwen/Qwen2.5-VL-3B-Instruct"' \
    './hf-xet-diag-linux.sh -i 30 -o diag -- ./server --port 8080'
}
# --- option parsing ---
# Report a command-line mistake on stderr, show the usage text, and exit 2.
# Arguments: the error message (all arguments are joined with spaces)
usage_error() {
  echo "ERROR: $*" >&2
  print_usage >&2
  exit 2
}

# Consume leading options. Parsing stops at `--` or at the first word that is
# not an option; everything from there on is the target command line.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -i|--interval)
      # Check before shifting: with only one argument left, `shift 2` fails
      # and `set -e` would end the script without any explanation.
      [[ $# -ge 2 ]] || usage_error "$1 requires a value."
      INTERVAL="$2"
      shift 2
      ;;
    -o|--outdir)
      # An empty directory would later expand to "/stacks" and "/dumps".
      [[ $# -ge 2 && -n "${2:-}" ]] || usage_error "$1 requires a non-empty directory."
      OUTDIR="$2"
      OUTDIR_SET=1
      shift 2
      ;;
    --no-preload)
      PRELOAD_HELPER=false
      shift
      ;;
    -h|--help)
      print_usage
      exit 0
      ;;
    --)
      shift
      break
      ;;
    -*)
      # A mistyped option must not be silently treated as the command to run.
      usage_error "Unknown option: $1"
      ;;
    *)
      break
      ;;
  esac
done

# INTERVAL is used in shell arithmetic by the monitoring loop, so it has to be
# a plain non-negative integer.
if [[ ! "$INTERVAL" =~ ^[0-9]+$ ]]; then
  usage_error "Interval must be a non-negative integer number of seconds, got '$INTERVAL'."
fi
# Force base 10 so values such as "08" are not rejected as invalid octal later.
INTERVAL=$((10#$INTERVAL))

if [[ $# -lt 1 ]]; then
  usage_error "No command provided."
fi
CMD=( "$@" )
# Make sure the external debugging tools are available on PATH.
# Every tool that cannot be resolved is collected so that all of them are
# reported in a single message; the script then exits with status 2.
missing=()
for tool in gdb gcore; do
  command -v "$tool" >/dev/null 2>&1 || missing+=("$tool")
done
if (( ${#missing[@]} > 0 )); then
  printf '%s\n' \
    "Missing required tools: ${missing[*]}" \
    "" \
    "Requires gdb & gcore. Install on Linux with:" \
    " sudo apt-get install gdb"
  exit 2
fi
#######################################
# Turn an arbitrary command string into a short, filesystem-safe name fragment.
# Every run of characters outside [A-Za-z0-9._-] collapses to a single "_",
# and the result is cut to a maximum length so that the generated directory
# name stays well below the usual 255-byte file name limit even for very long
# command lines.
# Arguments: $1 - raw command string
#            $2 - maximum fragment length (default: 100)
# Outputs:   the fragment on stdout, without a trailing newline
#######################################
mangle_cmd_for_dirname() {
  local raw="$1" max_len="${2:-100}" safe
  # printf instead of echo: echo's trailing newline would be translated by tr
  # into a stray "_" at the end of the fragment.
  safe="$(printf '%s' "$raw" | tr -cs 'A-Za-z0-9._-' '_')"
  printf '%s' "${safe:0:$max_len}"
}

# If no outdir given, generate one based on command
if [[ -z "$OUTDIR_SET" ]]; then
  CMD_STR="${CMD[*]}"
  SAFE_CMD="$(mangle_cmd_for_dirname "$CMD_STR")"
  OUTDIR="diag_${SAFE_CMD}_$(date +%Y%m%d%H%M%S)"
fi
# Stack snapshots and core dumps get their own subdirectories.
mkdir -p "$OUTDIR"/{stacks,dumps}
CONSOLE_LOG="$OUTDIR/console.log"
ENV_LOG="$OUTDIR/env.log"
PID_FILE="$OUTDIR/pid"
echo "Diagnostics output: $OUTDIR"
echo "Stack trace interval: ${INTERVAL}s"
echo "Command: ${CMD[*]}"
# --- collect some quick system info ---
# Best-effort snapshot of the environment: stdout and stderr of every probe
# end up in $ENV_LOG, and a failing probe never stops the script.
{
  printf '=== %s ===\n' "$(date -Is)"

  printf '%s\n' "uname -a:"
  uname -a
  printf '\n'

  printf '%s\n' "top snapshot:"
  # Keep only the CPU / memory / swap summary lines of one batch-mode sample.
  top -b -n 1 | grep -E "^%Cpu|^MiB Mem|^MiB Swap" || true
  printf '\n'

  printf '%s\n' "ulimit -a:"
  ulimit -a || true
  printf '\n'

  printf '%s\n' "python version:"
  python -VV || true
  printf '\n'
} > "$ENV_LOG" 2>&1 || true
# --- ptrace helper build (optional) ---
# Path of the preload library built for this run. It stays empty when no
# helper is available, so a later `-f "$ALLOW_PTRACE_SO"` check is false and
# nothing gets preloaded.
ALLOW_PTRACE_SO=""

#######################################
# Build a tiny shared library whose constructor calls
# prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY), for use via LD_PRELOAD.
#
# The library is compiled into a fresh directory created by mktemp instead of
# a fixed, predictable file in /tmp: a pre-existing /tmp/liballow_ptrace.so
# could have been planted by any local user and would then be loaded into the
# target process. For the same reason no existing file is ever reused.
#
# Globals:   PRELOAD_HELPER (read), CONSOLE_LOG (read),
#            ALLOW_PTRACE_SO (written: library path, or empty if none built)
# Arguments: none
# Outputs:   progress / warning lines on stdout and in $CONSOLE_LOG
# Returns:   0 in all handled cases (the helper is optional)
#######################################
maybe_build_ptrace_helper() {
  ALLOW_PTRACE_SO=""
  [[ "$PRELOAD_HELPER" == true ]] || return 0
  if ! command -v gcc >/dev/null 2>&1; then
    echo "Note: gcc not found; skipping ptrace helper." | tee -a "$CONSOLE_LOG"
    return 0
  fi

  local build_dir
  # A literal /tmp template (not $TMPDIR) keeps the path free of spaces and
  # colons, which act as separators inside LD_PRELOAD.
  if ! build_dir="$(mktemp -d /tmp/hf-xet-diag.XXXXXX)"; then
    echo "Warning: could not create a private build directory; proceeding without ptrace helper." | tee -a "$CONSOLE_LOG"
    return 0
  fi

  cat >"$build_dir/allow-ptrace.c" <<'EOF'
#define _GNU_SOURCE
#include <sys/prctl.h>
#ifndef PR_SET_PTRACER
#define PR_SET_PTRACER 0x59616d61
#endif
#ifndef PR_SET_PTRACER_ANY
#define PR_SET_PTRACER_ANY ((unsigned long)-1)
#endif
__attribute__((constructor))
static void allow_ptrace_ctor(void) {
    prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
}
EOF

  if ! gcc -shared -fPIC -O2 -o "$build_dir/liballow_ptrace.so" "$build_dir/allow-ptrace.c" 2>>"$CONSOLE_LOG"; then
    echo "Warning: failed to build ptrace helper; proceeding without." | tee -a "$CONSOLE_LOG"
    rm -rf -- "$build_dir"
    return 0
  fi

  # The directory is deliberately left in place: the library has to exist
  # when the target is launched with LD_PRELOAD pointing at it.
  ALLOW_PTRACE_SO="$build_dir/liballow_ptrace.so"
  echo "Built ptrace helper: $ALLOW_PTRACE_SO" | tee -a "$CONSOLE_LOG"
}
maybe_build_ptrace_helper
# --- download hf-xet dbg symbols ---
# Look up the installed hf-xet version, fetch the matching debug-symbol
# archive from the xet-core GitHub release, and copy the symbol file next to
# the installed package. Skipped when a symbols-<version> directory already
# exists in the current working directory.

# `|| true`: with `set -e -o pipefail`, a failing `pip show` (package missing)
# would otherwise abort the script right here, before the explanatory message
# below can be printed.
WHEEL_VERSION=$(pip show hf-xet | awk '/^Version:/{printf "%s", $2}' || true)
if [[ -z "$WHEEL_VERSION" ]]; then
  echo "Error: hf-xet package is not installed. Please install it before running this script." >&2
  exit 1
fi
echo "hf-xet wheel version: $WHEEL_VERSION"
SYMBOL_DIR="symbols-$WHEEL_VERSION"
if [[ -d "$SYMBOL_DIR" ]]; then
  echo "Existing symbols dir found, assuming previously installed."
else
  # Take everything after "Location: " so a site-packages path containing
  # spaces is not cut off at the first one.
  SITE_PACKAGES="$(pip show hf-xet | awk '/^Location:/{sub(/^Location:[ \t]*/, ""); printf "%s", $0}')"
  WHEEL_DIR="$SITE_PACKAGES/hf_xet"
  DIST_INFO="$SITE_PACKAGES/hf_xet-$WHEEL_VERSION.dist-info"
  WHEEL_FILE="$DIST_INFO/WHEEL"
  if [[ ! -r "$WHEEL_FILE" ]]; then
    echo "Error: Cannot read wheel metadata at $WHEEL_FILE" >&2
    exit 1
  fi
  # Reconstruct wheel name from wheel version and wheel tag
  # NOTE(review): several "Tag:" lines would be concatenated here — confirm
  # how multi-tag wheels are named inside the symbol archive.
  WHEEL_TAG=$(awk '/^Tag:/{printf "%s", $2}' "$WHEEL_FILE")
  SYMBOL_FILENAME="hf_xet-$WHEEL_VERSION-$WHEEL_TAG.so.dbg"
  echo "Downloading debug symbols: $SYMBOL_FILENAME"
  # If the version is of format "1.1.10rc0", change it to our release tag format like "1.1.10-rc0"
  RELEASE_TAG=$(printf '%s' "$WHEEL_VERSION" | sed 's/\([0-9]\)\(rc.*\)$/\1-\2/')
  DOWNLOAD_URL="https://github.com/huggingface/xet-core/releases/download/v${RELEASE_TAG}/dbg-symbols.zip"
  # Test curl inside the `if`: checking `$?` on a following line never runs
  # under `set -e`, because the failing curl already ended the script.
  if ! curl -fL "$DOWNLOAD_URL" -o dbg-symbols.zip; then
    echo "Error: Failed to download debug symbols from $DOWNLOAD_URL" >&2
    exit 1
  fi
  # Extract into a staging directory and publish it under its final name only
  # after the copy succeeded. Otherwise an interrupted run would leave
  # $SYMBOL_DIR behind and the next run would wrongly assume the symbols are
  # already installed.
  STAGING_DIR="$SYMBOL_DIR.partial"
  rm -rf -- "$STAGING_DIR"
  unzip dbg-symbols.zip -d "$STAGING_DIR"
  # Copy to package directory
  cp -r "$STAGING_DIR/dbg-symbols/$SYMBOL_FILENAME" "$WHEEL_DIR/"
  mv -- "$STAGING_DIR" "$SYMBOL_DIR"
  echo "Installed dbg symbol $SYMBOL_FILENAME to $WHEEL_DIR"
fi
# --- launch target ---
# Start the target in the background with its stdout/stderr appended to the
# console log, and learn its PID through $PID_FILE.
SCRIPT_START_TIME=$(date +%s)
echo "Launching target at $(date -Is) ..." | tee -a "$CONSOLE_LOG"

# A PID file left by an earlier run into the same -o directory would satisfy
# the wait loop below immediately and make the script monitor a stale PID.
rm -f -- "$PID_FILE"

# Full command line to execute; wrapped in `env LD_PRELOAD=...` when the
# ptrace helper is enabled and its library exists.
LAUNCH_CMD=( "${CMD[@]}" )
if [[ "$PRELOAD_HELPER" == true && -f "$ALLOW_PTRACE_SO" ]]; then
  # Keep any LD_PRELOAD the caller already had by appending it after ours.
  LAUNCH_CMD=( env "LD_PRELOAD=$ALLOW_PTRACE_SO${LD_PRELOAD:+:$LD_PRELOAD}" "${CMD[@]}" )
  echo "Using LD_PRELOAD=$ALLOW_PTRACE_SO to relax ptrace restrictions." | tee -a "$CONSOLE_LOG"
fi

# The subshell backgrounds the target, records the PID of that background job
# and returns; the target's output keeps flowing through the pipe into tee.
(
  "${LAUNCH_CMD[@]}" & echo $! > "$PID_FILE"
) 2>&1 | tee -a "$CONSOLE_LOG" &
LOGGER_BG=$!

# read PID (poll for up to 50 x 0.1s until the subshell has written it)
for _ in {1..50}; do
  [[ -s "$PID_FILE" ]] && break
  sleep 0.1
done
if [[ ! -s "$PID_FILE" ]]; then
  echo "ERROR: Could not determine child PID." | tee -a "$CONSOLE_LOG"
  exit 1
fi
TARGET_PID="$(cat "$PID_FILE")"
# Everything downstream (kill -0, gdb -p, gcore) needs a numeric PID.
if [[ ! "$TARGET_PID" =~ ^[0-9]+$ ]]; then
  echo "ERROR: Could not determine child PID." | tee -a "$CONSOLE_LOG"
  exit 1
fi
echo "Started PID: $TARGET_PID" | tee -a "$CONSOLE_LOG"
# --- stack capture + hang detection ---
# Rolling list of the most recent stack snapshot files (at most three),
# oldest first.
declare -a LAST_STACKS=()

#######################################
# Write one stack snapshot of the target into $OUTDIR/stacks and run the hang
# check on the most recent snapshots.
# The first available tool is used: gdb, eu-stack, pstack, and as a last
# resort the per-task files under /proc/<pid>/task/*/stack. Tool failures are
# tolerated; whatever the tool printed (including errors) is kept in the file.
# Globals:   OUTDIR, TARGET_PID, CONSOLE_LOG (read); LAST_STACKS (updated)
# Arguments: none
# Outputs:   one "captured stack" line on stdout and in $CONSOLE_LOG
#######################################
capture_stack() {
  # `t` is declared local so the fallback loop no longer leaks it globally.
  local ts stack_file t
  ts="$(date +%Y%m%d%H%M%S)"
  stack_file="$OUTDIR/stacks/stack_${ts}.txt"
  if command -v gdb >/dev/null 2>&1; then
    gdb -p "$TARGET_PID" -batch \
      -ex "set pagination off" \
      -ex "thread apply all bt full" >"$stack_file" 2>&1 || true
  elif command -v eu-stack >/dev/null 2>&1; then
    eu-stack -p "$TARGET_PID" >"$stack_file" 2>&1 || true
  elif command -v pstack >/dev/null 2>&1; then
    pstack "$TARGET_PID" >"$stack_file" 2>&1 || true
  else
    for t in /proc/"$TARGET_PID"/task/*/stack; do
      # When the glob matches nothing the literal pattern is left; skip it.
      [[ -e "$t" ]] || continue
      echo "=== $t ==="; cat "$t"; echo
    done >"$stack_file" 2>&1 || true
  fi
  echo "$(date -Is) captured stack -> $stack_file" | tee -a "$CONSOLE_LOG"
  LAST_STACKS+=("$stack_file")
  # Keep only the three newest snapshots for the hang comparison.
  if (( ${#LAST_STACKS[@]} > 3 )); then
    LAST_STACKS=("${LAST_STACKS[@]: -3}")
  fi
  check_hang
}
#######################################
# Hang heuristic: if the three most recent stack snapshots are identical once
# hex addresses and blank lines are removed, assume the target is stuck, take
# a core dump, and start a new comparison window.
#
# Snapshots that contain no stack frames at all are not counted as a hang:
# when the capture tool itself fails (for example because attaching is not
# permitted), every snapshot holds the same error text, which would otherwise
# look like three identical stacks.
#
# Globals:   LAST_STACKS (read; cleared after a detected hang), CONSOLE_LOG
# Arguments: none
# Outputs:   a note or the hang warning on stdout and in $CONSOLE_LOG
# Returns:   0 when no hang is detected; otherwise the status of the final reset
#######################################
check_hang() {
  if (( ${#LAST_STACKS[@]} < 3 )); then
    return 0
  fi

  local -a norm=()
  local snapshot
  for snapshot in "${LAST_STACKS[@]:0:3}"; do
    # `|| true`: grep exits non-zero when nothing is left after filtering.
    norm+=( "$(sed 's/0x[0-9a-f]\+//g' "$snapshot" | grep -v '^$' || true)" )
  done

  # Any difference between snapshots 1-2 or 2-3 means the target is moving.
  if [[ "${norm[0]}" != "${norm[1]}" || "${norm[1]}" != "${norm[2]}" ]]; then
    return 0
  fi

  # Frame lines look like "#0 ..." (gdb, eu-stack, pstack) or "[<...>] ..."
  # (/proc/<pid>/task/*/stack).
  if ! grep -Eq '^[[:space:]]*(#[0-9]+|\[<)' <<<"${norm[2]}"; then
    echo "Note: last 3 stack snapshots are identical but contain no stack frames (capture failed?); not treating this as a hang." | tee -a "$CONSOLE_LOG"
    return 0
  fi

  echo "⚠️ Hang heuristic triggered at $(date -Is)" | tee -a "$CONSOLE_LOG"
  take_core_dump
  LAST_STACKS=()
}
#######################################
# Write a core dump of the target into $OUTDIR/dumps.
# Uses gcore when available (its output is appended to $CONSOLE_LOG);
# otherwise tries to copy up to 50 MiB from /proc/<pid>/mem. A failing dump
# tool never stops the script, but success is only reported when the tool
# actually succeeded.
# Globals:   OUTDIR, TARGET_PID, CONSOLE_LOG (read)
# Arguments: none
# Outputs:   a status line on stdout and in $CONSOLE_LOG
#######################################
take_core_dump() {
  local ts core_file
  ts="$(date +%Y%m%d%H%M%S)"
  core_file="$OUTDIR/dumps/core_${ts}"
  if command -v gcore >/dev/null 2>&1; then
    # gcore appends ".<pid>" to the prefix given with -o.
    if gcore -o "$core_file" "$TARGET_PID" >>"$CONSOLE_LOG" 2>&1; then
      echo "Core dump saved: ${core_file}.${TARGET_PID}" | tee -a "$CONSOLE_LOG"
    else
      echo "Warning: gcore failed for PID $TARGET_PID; no core dump saved (details in $CONSOLE_LOG)." | tee -a "$CONSOLE_LOG"
    fi
  else
    echo "gcore not available, saving partial /proc/$TARGET_PID/mem dump" | tee -a "$CONSOLE_LOG"
    if dd if="/proc/$TARGET_PID/mem" of="${core_file}.raw" bs=1M count=50 status=none 2>>"$CONSOLE_LOG"; then
      echo "Partial raw dump saved: ${core_file}.raw" | tee -a "$CONSOLE_LOG"
    else
      echo "Warning: reading /proc/$TARGET_PID/mem failed; ${core_file}.raw may be empty or incomplete." | tee -a "$CONSOLE_LOG"
    fi
  fi
}
# --- monitoring loop ---
# Poll once per second for as long as the target PID can be signalled, and
# take a stack snapshot whenever at least INTERVAL seconds have passed since
# the previous one. Starting from 0 makes the first snapshot due immediately.
LAST_SNAPSHOT_AT=0
while true; do
  # Signal 0 only tests whether the process still exists.
  kill -0 "$TARGET_PID" 2>/dev/null || break
  now_epoch=$(date +%s)
  elapsed=$(( now_epoch - LAST_SNAPSHOT_AT ))
  if (( elapsed >= INTERVAL )); then
    # A failed capture must not end the monitoring.
    capture_stack || true
    LAST_SNAPSHOT_AT=$now_epoch
  fi
  sleep 1
done
printf 'Process %s has exited at %s.\n' "$TARGET_PID" "$(date -Is)" | tee -a "$CONSOLE_LOG"
# --- collect xet log files from this execution ---
#######################################
# Copy the xet log files written since the script started into
# $OUTDIR/xet_logs.
# Log files are looked up under $HF_HOME/xet/logs, with HF_HOME defaulting to
# $HOME/.cache/huggingface. Collection is best effort: neither a `find` error
# nor a file that cannot be copied ends the script. In a `find | while`
# pipeline, `pipefail` together with `set -e` would turn such an error into an
# abort before the final summary line.
# Globals:   HF_HOME, XET_LOG_DIR (written); OUTDIR, SCRIPT_START_TIME,
#            CONSOLE_LOG (read)
# Arguments: none
# Outputs:   progress lines on stdout and in $CONSOLE_LOG
#######################################
collect_xet_logs() {
  HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}"
  XET_LOG_DIR="$HF_HOME/xet/logs"
  if [[ ! -d "$XET_LOG_DIR" ]]; then
    echo "No xet log directory found at $XET_LOG_DIR" | tee -a "$CONSOLE_LOG"
    return 0
  fi
  echo "Collecting xet logs from $XET_LOG_DIR ..." | tee -a "$CONSOLE_LOG"
  mkdir -p "$OUTDIR/xet_logs"
  local logfile
  # Find log files created during or after script start time using GNU find.
  # NUL-delimited names plus process substitution: file names are read
  # verbatim and find's exit status cannot fail the loop.
  while IFS= read -r -d '' logfile; do
    if cp -- "$logfile" "$OUTDIR/xet_logs/" 2>/dev/null; then
      echo " Copied: $(basename "$logfile")" | tee -a "$CONSOLE_LOG"
    fi
  done < <(find "$XET_LOG_DIR" -name "xet_*.log" -type f -newermt "@$SCRIPT_START_TIME" -print0 2>/dev/null)
}
collect_xet_logs
echo "Logs and stacks are in: $OUTDIR"
# Detach the background logger job so it is no longer tracked by this shell.
disown "$LOGGER_BG" 2>/dev/null || true