-
Notifications
You must be signed in to change notification settings - Fork 72
Expand file tree
/
Copy pathhf-xet-diag-macos.sh
More file actions
executable file
·239 lines (201 loc) · 7.28 KB
/
hf-xet-diag-macos.sh
File metadata and controls
executable file
·239 lines (201 loc) · 7.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/usr/bin/env bash
# hf-xet-diag-macos.sh — macOS-only diagnostics runner
# Runs a target command, periodically snapshots stacks with `sample`,
# detects hangs, and can dump cores with `lldb`.
# Installs hf-xet debug symbols if available.
# By default the output directory name includes a sanitized copy of the
# command string, so results are easy to correlate with the command that was run.
# Strict mode: exit on command failure (errexit), let ERR traps propagate into
# functions and subshells (errtrace), treat unset variables as errors
# (nounset), and fail a pipeline when any stage fails (pipefail).
set -o errtrace -o errexit -o nounset -o pipefail

# Defaults
OUTDIR_SET=""   # becomes non-empty once -o/--outdir is given
OUTDIR=""       # output directory; generated later when not supplied
INTERVAL=120    # seconds between stack snapshots
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the usage text on stdout, one line per printf argument
#######################################
print_usage() {
  printf '%s\n' \
    'Usage: hf-xet-diag-macos.sh [options] -- <command> [args...]' \
    'Runs a target command, periodically snapshots stacks (via `sample`),' \
    'detects hangs, and can dump cores (via `lldb`).' \
    'Also installs hf-xet debug symbols if available.' \
    'Requires: sample, lldb (Xcode Command Line Tools), curl, unzip, pip' \
    'Options:' \
    '-i, --interval SECONDS Stack snapshot cadence (default: 120)' \
    '-o, --outdir DIR Output directory (default: diag_<CMD>_<timestamp>)' \
    '-h, --help Show this help' \
    'Examples:' \
    './hf-xet-diag-macos.sh -- python hfxet-test.py "Qwen/Qwen2.5-VL-3B-Instruct"' \
    './hf-xet-diag-macos.sh -i 60 -- ./myapp --flag'
}
# --- option parsing ---
# Leading options are consumed here. Parsing stops at `--` or at the first
# word that is not a recognised option; whatever remains is the target command.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -i|--interval)
      # Without this guard a trailing `-i` makes `shift 2` fail, and `set -e`
      # then terminates the script without any explanation.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: Option $1 requires an argument." >&2
        print_usage >&2
        exit 2
      fi
      INTERVAL="$2"
      shift 2
      ;;
    -o|--outdir)
      # An empty directory would make the later `mkdir -p "$OUTDIR"/{stacks,dumps}`
      # operate on /stacks and /dumps, so it is rejected together with a
      # missing value.
      if [[ $# -lt 2 || -z "${2:-}" ]]; then
        echo "ERROR: Option $1 requires a non-empty directory argument." >&2
        print_usage >&2
        exit 2
      fi
      OUTDIR="$2"
      OUTDIR_SET=1
      shift 2
      ;;
    -h|--help)
      print_usage
      exit 0
      ;;
    --)
      shift
      break
      ;;
    *)
      break
      ;;
  esac
done

# INTERVAL is used in shell arithmetic by the monitoring loop; accept only a
# plain non-negative integer so a typo fails here instead of mid-run.
if ! [[ "$INTERVAL" =~ ^[0-9]+$ ]]; then
  echo "ERROR: Interval must be a non-negative integer number of seconds (got '$INTERVAL')." >&2
  exit 2
fi
# Force base 10 so values such as "08" are not later parsed as invalid octal.
INTERVAL=$((10#$INTERVAL))

if [[ $# -lt 1 ]]; then
  echo "ERROR: No command provided." >&2
  print_usage >&2
  exit 2
fi
CMD=( "$@" )
# Tool availability check
# Every external program the script invokes is verified up front so that a
# missing one is reported before any work starts. `pip` is included because
# the symbol-download step runs `pip show hf-xet`.
missing=()
for tool in sample lldb curl unzip pip; do
  command -v "$tool" >/dev/null 2>&1 || missing+=("$tool")
done
if (( ${#missing[@]} > 0 )); then
  {
    echo "Missing required tools: ${missing[*]}"
    echo "Install Xcode Command Line Tools with:"
    echo " xcode-select --install"
    # The hint above does not help with pip, so say so explicitly.
    case " ${missing[*]} " in
      *" pip "*) echo "pip must be available in the Python environment that has hf-xet installed." ;;
    esac
  } >&2
  exit 2
fi
# If no outdir given, generate one based on command
if [[ -z "$OUTDIR_SET" ]]; then
  CMD_STR="${CMD[*]}"
  # Every character outside [A-Za-z0-9] becomes `_`. The trailing newline is
  # kept on purpose: it is mangled to `_` as well, which is how directory
  # names have always been formed. printf is used because echo would
  # misinterpret a command string such as "-n".
  SAFE_CMD=$(printf '%s\n' "$CMD_STR" | tr -c 'A-Za-z0-9' '_')
  # Cap the length: a long command line would otherwise yield a directory
  # name longer than the filesystem allows and mkdir below would fail.
  SAFE_CMD="${SAFE_CMD:0:100}"
  OUTDIR="diag_${SAFE_CMD}_$(date +%Y%m%d%H%M%S)"
fi
# `--` keeps an OUTDIR that starts with `-` from being parsed as an option.
mkdir -p -- "$OUTDIR"/{stacks,dumps}
CONSOLE_LOG="$OUTDIR/console.log"
ENV_LOG="$OUTDIR/env.log"
PID_FILE="$OUTDIR/pid"
echo "Diagnostics output: $OUTDIR"
echo "Stack trace interval: ${INTERVAL}s"
echo "Command: ${CMD[*]}"
# --- collect some quick system info ---
# Best-effort snapshot written to $ENV_LOG. The whole group is followed by
# `|| true`, so a failing probe never stops the run.
{
  printf '=== %s ===\n' "$(date "+%Y-%m-%dT%H:%M:%S%z")"

  printf '%s\n' "uname -a:"
  uname -a
  printf '\n'

  printf '%s\n' "top snapshot:"
  top -l 1 | grep -E "^CPU|^Phys" || true
  printf '\n'

  printf '%s\n' "ulimit -a:"
  ulimit -a || true
  printf '\n'

  printf '%s\n' "python version:"
  python3 -VV || true
  printf '\n'
} > "$ENV_LOG" 2>&1 || true
# --- download hf-xet dbg symbols ---
# Query pip once. `pip show` exits non-zero when the package is absent; under
# `set -e` + `pipefail` that would end the script inside the assignment,
# before the explanatory error below could be printed. The status is therefore
# ignored here and an empty version string is what signals "not installed".
PIP_SHOW_OUTPUT="$(pip show hf-xet || true)"
WHEEL_VERSION="$(awk '/^Version:/{print $2; exit}' <<<"$PIP_SHOW_OUTPUT")"
if [[ -z "$WHEEL_VERSION" ]]; then
  echo "Error: hf-xet package is not installed. Please install it before running this script." >&2
  exit 1
fi
echo "hf-xet wheel version: $WHEEL_VERSION"
SYMBOL_DIR="symbols-$WHEEL_VERSION"
if [[ -d "$SYMBOL_DIR" ]]; then
  echo "Existing symbols dir found, assuming previously installed."
else
  # Keep the whole Location value so a site-packages path containing spaces
  # is not cut off at the first blank.
  SITE_PACKAGES="$(awk '/^Location:/{sub(/^Location:[ \t]*/, ""); print; exit}' <<<"$PIP_SHOW_OUTPUT")"
  WHEEL_DIR="$SITE_PACKAGES/hf_xet"
  DIST_INFO="$SITE_PACKAGES/hf_xet-$WHEEL_VERSION.dist-info"
  WHEEL_FILE="$DIST_INFO/WHEEL"
  if [[ ! -f "$WHEEL_FILE" ]]; then
    echo "Error: wheel metadata not found at $WHEEL_FILE" >&2
    exit 1
  fi
  # Reconstruct wheel name from wheel version and wheel tag.
  # Only the first Tag line is used; printing every match would glue several
  # tags into one string.
  # NOTE(review): assumes the first tag is the one the symbol file is named after — confirm.
  WHEEL_TAG="$(awk '/^Tag:/{print $2; exit}' "$WHEEL_FILE")"
  SYMBOL_FILENAME="hf_xet-$WHEEL_VERSION-$WHEEL_TAG.dylib.dSYM"
  echo "Downloading debug symbols: $SYMBOL_FILENAME"
  # If the version is of format "1.1.10rc0", change it to our release tag format like "1.1.10-rc0"
  RELEASE_TAG="$(printf '%s\n' "$WHEEL_VERSION" | sed 's/\([0-9]\)\(rc.*\)$/\1-\2/')"
  DOWNLOAD_URL="https://github.com/huggingface/xet-core/releases/download/v${RELEASE_TAG}/dbg-symbols.zip"
  # Test curl directly: under `set -e` a separate `$?` check is never reached
  # after a failure.
  if ! curl -fL "$DOWNLOAD_URL" -o dbg-symbols.zip; then
    echo "Error: Failed to download debug symbols from $DOWNLOAD_URL" >&2
    exit 1
  fi
  # Extract the archive, then copy the matching symbol bundle into the package
  # directory. On failure the extraction directory is removed again;
  # otherwise the next run would see it and wrongly assume the symbols are
  # already installed.
  if ! unzip dbg-symbols.zip -d "$SYMBOL_DIR" ||
     ! cp -r "$SYMBOL_DIR/dbg-symbols/$SYMBOL_FILENAME" "$WHEEL_DIR/"; then
    rm -rf -- "$SYMBOL_DIR"
    echo "Error: Failed to install debug symbols $SYMBOL_FILENAME into $WHEEL_DIR" >&2
    exit 1
  fi
  echo "Installed dbg symbol $SYMBOL_FILENAME to $WHEEL_DIR"
fi
# --- launch target ---
SCRIPT_START_TIME=$(date +%s)
REF_FILE="$OUTDIR/.ref_timestamp"
touch "$REF_FILE" # Reference file for finding logs created after this point
# Ensure REF_FILE is cleaned up on exit
trap 'rm -f "$REF_FILE"' EXIT
# Remove any PID file left by an earlier run into the same output directory.
# A stale, non-empty file would satisfy the wait loop below immediately and
# the monitor could attach to the wrong (old) PID.
rm -f -- "$PID_FILE"
echo "Launching target at $(date "+%Y-%m-%dT%H:%M:%S%z") ..." | tee -a "$CONSOLE_LOG"
# The subshell starts the command in the background, records its PID, and
# returns; the command's stdout/stderr keep flowing through the pipe into tee,
# which appends them to the console log while echoing them to the terminal.
(
  "${CMD[@]}" &
  echo $! > "$PID_FILE"
) 2>&1 | tee -a "$CONSOLE_LOG" &
LOGGER_BG=$!
# read PID: wait up to ~5 seconds (50 x 0.1s) for the subshell to write it
for _ in {1..50}; do
  [[ -s "$PID_FILE" ]] && break
  sleep 0.1
done
if [[ ! -s "$PID_FILE" ]]; then
  echo "ERROR: Could not determine child PID." | tee -a "$CONSOLE_LOG"
  exit 1
fi
TARGET_PID="$(cat "$PID_FILE")"
echo "Started PID: $TARGET_PID" | tee -a "$CONSOLE_LOG"
# --- stack capture + hang detection ---
# Paths of the most recent stack snapshots (at most three), oldest first.
declare -a LAST_STACKS=()

#######################################
# Take one `sample` snapshot of the target process and run hang detection.
# Globals:   TARGET_PID, OUTDIR, CONSOLE_LOG (read); LAST_STACKS (written)
# Arguments: none
# Outputs:   a status line on stdout, also appended to CONSOLE_LOG
# Returns:   0 when the snapshot could not be taken; otherwise the status of
#            check_hang
#######################################
capture_stack() {
  local ts stack_file
  ts="$(date +%Y%m%d%H%M%S)"
  stack_file="$OUTDIR/stacks/stack_${ts}.txt"
  # The second argument is passed to sample as its duration
  # (seconds, per the sample(1) manual — confirm on the target macOS version).
  sample "$TARGET_PID" 5 -file "$stack_file" || true
  # A failed sample (for instance because the process just exited) leaves no
  # usable file. Do not log it as captured and do not feed it to hang
  # detection, where missing files would all look identical.
  if [[ ! -s "$stack_file" ]]; then
    echo "$(date "+%Y-%m-%dT%H:%M:%S%z") stack capture failed for PID $TARGET_PID (nothing written to $stack_file)" | tee -a "$CONSOLE_LOG"
    return 0
  fi
  echo "$(date "+%Y-%m-%dT%H:%M:%S%z") captured stack -> $stack_file" | tee -a "$CONSOLE_LOG"
  LAST_STACKS+=("$stack_file")
  # Keep only the three newest snapshots; that is all check_hang compares.
  if (( ${#LAST_STACKS[@]} > 3 )); then
    LAST_STACKS=("${LAST_STACKS[@]: -3}")
  fi
  check_hang
}
#######################################
# Decide whether the target looks hung: the three most recent stack snapshots
# are identical once hexadecimal addresses and blank lines are removed.
# On a hang, takes a core dump and clears the snapshot history.
# Globals:   LAST_STACKS (read, cleared on hang); CONSOLE_LOG (read)
# Arguments: none
# Outputs:   a warning line on stdout and in CONSOLE_LOG when a hang is found
# Returns:   0 when no decision can be made or no hang is found
#######################################
check_hang() {
  local snapshot normalized previous="" compared=0
  # need three snapshots to decide
  if (( ${#LAST_STACKS[@]} < 3 )); then
    return 0
  fi
  for snapshot in "${LAST_STACKS[@]:0:3}"; do
    # A missing or empty snapshot means the capture failed. That is not
    # evidence of a hang, and three such files would otherwise compare equal.
    [[ -s "$snapshot" ]] || return 0
    # normalize: strip addresses and empty lines.
    # -E with `+` is used because `\+` in a basic regex is a GNU extension
    # that BSD sed may not honour. `|| true` covers grep exiting 1 when
    # every line is blank.
    normalized=$(sed -E 's/0x[0-9a-f]+//g' "$snapshot" | grep -v '^$' || true)
    # Any difference between consecutive snapshots => stacks changed -> NOT a hang
    if (( compared > 0 )) && [[ "$normalized" != "$previous" ]]; then
      return 0
    fi
    previous="$normalized"
    compared=$((compared + 1))
  done
  # NOTE(review): if the snapshot files carry per-run header values (dates,
  # sample counts) they will never compare equal and no hang will ever be
  # reported — confirm against real `sample` output.
  # All three snapshots are the same => HANG
  echo "⚠️ Hang detected at $(date "+%Y-%m-%dT%H:%M:%S%z") — taking core dump." | tee -a "$CONSOLE_LOG"
  take_core_dump
  # Start over so the same three snapshots do not trigger another dump.
  LAST_STACKS=()
}
#######################################
# Attach lldb to the target and ask it to write a core file (best effort).
# Globals:   TARGET_PID, OUTDIR, CONSOLE_LOG (read)
# Arguments: none
# Outputs:   lldb output is appended to CONSOLE_LOG; one result line goes to
#            stdout and CONSOLE_LOG
# Returns:   0 even when the dump fails, so monitoring carries on
#######################################
take_core_dump() {
  local ts core_file
  ts="$(date +%Y%m%d%H%M%S)"
  core_file="$OUTDIR/dumps/dump_${TARGET_PID}_${ts}.core"
  # The path is double-quoted inside the lldb command so that an output
  # directory containing spaces is still passed as a single argument.
  lldb -p "$TARGET_PID" -o "process save-core \"$core_file\"" -o "quit" >>"$CONSOLE_LOG" 2>&1 || true
  # lldb failures are tolerated above, so check for the file before claiming
  # success.
  if [[ -s "$core_file" ]]; then
    echo "Core dump saved: $core_file" | tee -a "$CONSOLE_LOG"
  else
    echo "Core dump failed: $core_file was not written (see lldb output in $CONSOLE_LOG)" | tee -a "$CONSOLE_LOG"
  fi
}
# --- monitoring loop ---
# Poll once per second for as long as the target PID accepts signal 0.
# LAST_SNAPSHOT_AT starts at 0, so the first pass always takes a snapshot;
# later snapshots follow once INTERVAL seconds have elapsed.
LAST_SNAPSHOT_AT=0
while :; do
  kill -0 "$TARGET_PID" 2>/dev/null || break
  now=$(date +%s)
  if (( INTERVAL <= now - LAST_SNAPSHOT_AT )); then
    # Snapshot problems must never end the monitoring.
    capture_stack || true
    LAST_SNAPSHOT_AT=$now
  fi
  sleep 1
done
echo "Process $TARGET_PID has exited at $(date "+%Y-%m-%dT%H:%M:%S%z")." | tee -a "$CONSOLE_LOG"
# --- collect xet log files from this execution ---
HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}"
XET_LOG_DIR="$HF_HOME/xet/logs"
if [[ -d "$XET_LOG_DIR" ]]; then
  echo "Collecting xet logs from $XET_LOG_DIR ..." | tee -a "$CONSOLE_LOG"
  mkdir -p "$OUTDIR/xet_logs"
  # Find log files newer than the reference file touched just before launch.
  # Names are read NUL-delimited with IFS cleared so unusual file names
  # survive intact. The loop reads from a process substitution rather than a
  # pipeline, so under `set -e`/`pipefail` a failed copy or a find error can
  # no longer end the script before the closing message.
  while IFS= read -r -d '' logfile; do
    if cp -- "$logfile" "$OUTDIR/xet_logs/" 2>/dev/null; then
      echo " Copied: $(basename "$logfile")" | tee -a "$CONSOLE_LOG"
    else
      echo " Failed to copy: $(basename "$logfile")" | tee -a "$CONSOLE_LOG" >&2
    fi
  done < <(find "$XET_LOG_DIR" -name "xet_*.log" -type f -newer "$REF_FILE" -print0 2>/dev/null)
else
  echo "No xet log directory found at $XET_LOG_DIR" | tee -a "$CONSOLE_LOG"
fi
echo "Logs and stacks are in: $OUTDIR"
# Detach the background logger pipeline from job control; errors are ignored
# in case it has already finished.
disown "$LOGGER_BG" 2>/dev/null || true