-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathatom_test.sh
More file actions
executable file
·217 lines (186 loc) · 7.5 KB
/
atom_test.sh
File metadata and controls
executable file
·217 lines (186 loc) · 7.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/bin/bash
#############################################
# GPU Load Test for DeepSeek-R1 Model
# Clean version with improved table output
#############################################
# ---------------------------------------------------------------------------
# Test parameters. The model/engine settings are forwarded to the inference
# command later in the script; LOG_FILE is unique per host and per second.
# ---------------------------------------------------------------------------
MODEL_NAME='deepseek-ai/DeepSeek-R1-0528'
MODEL_LOCAL_PATH='/models/deepseek-ai/DeepSeek-R1-0528'
TENSOR_PARALLEL=8
KV_CACHE_DTYPE='fp8'
TEMPERATURE=0
LOG_FILE="/tmp/gpu_load_test_$(hostname)_$(date +%Y%m%d_%H%M%S).log"

# Run banner (the empty line before the terminator is part of the output).
cat <<EOF
=========================================
GPU Load Test - DeepSeek-R1
=========================================
Hostname: $(hostname)
Date: $(date)
Log: $LOG_FILE

EOF

# Resolve MODEL_PATH: start from the model identifier and switch to the local
# directory only when it contains a config.json.
MODEL_PATH="$MODEL_NAME"
if [[ -f "${MODEL_LOCAL_PATH}/config.json" ]]; then
  MODEL_PATH="$MODEL_LOCAL_PATH"
  printf '%s\n' "✅ Found model at: $MODEL_LOCAL_PATH"
else
  printf '%s\n' "⚠️ Model not found locally, will download: $MODEL_NAME"
fi
printf '\n'
# GPU Status
#######################################
# Count the lines of `rocm-smi --showid` output that contain "GPU".
# Arguments: none
# Outputs:   exactly one integer on stdout; 8 when no line matches
#            (which includes rocm-smi printing nothing at all)
#######################################
count_gpus() {
  local count
  # `grep -c` prints "0" AND exits non-zero when nothing matches, so putting
  # `|| echo 8` inside the command substitution yields two lines ("0", "8").
  # Capture the output first, then replace it when grep reports no match.
  count=$(rocm-smi --showid 2>/dev/null | grep -c 'GPU') || count=8
  printf '%s\n' "$count"
}

# Informational only: skipped entirely when rocm-smi is not installed.
if command -v rocm-smi &> /dev/null; then
  GPU_COUNT=$(count_gpus)
  echo "GPU Count: $GPU_COUNT"
  echo ""
fi
# Remove existing container
# Force-removes any container still named atom_inference (the `docker run`
# below has no --rm, so a previous run leaves one behind). The error printed
# when no such container exists is discarded.
docker rm -f atom_inference 2>/dev/null
# Run the test
echo "Starting model load test..."
echo ""
# Start rocm/atom-dev:latest and execute the inline script given to `bash -c`.
#
# Container setup: host networking, /dev/kfd and /dev/dri passed through,
# host /data bind-mounted at /data, HF_HOME pointed at /data/huggingface_cache.
# NOTE(review): /data is the only bind mount, while MODEL_LOCAL_PATH lives
# under /models - confirm the container can see that path when the local
# model copy is selected.
#
# Inline script, in order:
#   1. insert `import time` after line 1 of model_runner.py unless a line
#      consisting of exactly `import time` already exists;
#   2. use sed `i\` / `a\` to insert a [LOAD_START] log line before, and a
#      [LOAD_DONE] log line (with elapsed seconds) after, the line matching
#      the load_model(...) call;
#   3. run atom.examples.simple_inference with the host-side settings.
#
# Quoting: the script is single-quoted on the host, so `\\` and `\"` reach the
# inner bash unchanged; inside the inner double quotes they become `\` and `"`
# for sed. Host variables are spliced in by closing the single quote, i.e.
# '"$VAR"', so they are expanded by the host shell, not inside the container.
# NOTE(review): sed inserts the text after `i\` / `a\` as written; confirm the
# inserted Python lines carry the indentation required at the insertion point.
#
# stdout and stderr are both piped through tee into $LOG_FILE. This script
# does not enable pipefail, so the pipeline status is tee's and a docker
# failure is not reflected in $?.
docker run \
--name atom_inference \
--network=host \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
-v /data:/data \
-e HF_HOME=/data/huggingface_cache \
-e NCCL_DEBUG=WARN \
-e RCCL_DEBUG=WARN \
--shm-size=16G \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
rocm/atom-dev:latest \
bash -c '
MODEL_RUNNER="/app/ATOM/atom/model_engine/model_runner.py"
# Add timing instrumentation
if ! grep -q "^import time$" "$MODEL_RUNNER"; then
sed -i "1a import time" "$MODEL_RUNNER"
fi
# Instrument model loading
sed -i "/load_model(self.model, config.model, config.hf_config, config.load_dummy)/i\\
load_start_time = time.time()\\
logger.info(f\"[LOAD_START] GPU {self.rank} | Time: {load_start_time:.6f}\")" \
"$MODEL_RUNNER"
sed -i "/load_model(self.model, config.model, config.hf_config, config.load_dummy)/a\\
load_elapsed = time.time() - load_start_time\\
logger.info(f\"[LOAD_DONE] GPU {self.rank} | Duration: {load_elapsed:.2f}s\")" \
"$MODEL_RUNNER"
# Run inference
python3 -m atom.examples.simple_inference \
--model "'"$MODEL_PATH"'" \
--kv_cache_dtype "'"$KV_CACHE_DTYPE"'" \
-tp "'"$TENSOR_PARALLEL"'" \
--temperature "'"$TEMPERATURE"'"
' 2>&1 | tee "$LOG_FILE"
# Analyze results
echo ""
echo "========================================="
echo "GPU LOAD TIME ANALYSIS"
echo "========================================="
echo ""
# Check if test completed
# Count the [LOAD_DONE] marker lines in the log.
# `grep -c` prints "0" and exits 1 when nothing matches, and prints nothing
# and exits 2 when the file cannot be read. A `|| echo 0` inside the command
# substitution therefore produced "0<newline>0" for a marker-free log, which
# made the numeric test below error out and skipped the failure exit.
# Assigning the fallback outside the substitution keeps the value a single
# integer in every case.
LOAD_COUNT=$(grep -c "\[LOAD_DONE\]" "$LOG_FILE" 2>/dev/null) || LOAD_COUNT=0
LOAD_COUNT=${LOAD_COUNT:-0}
if (( LOAD_COUNT == 0 )); then
  echo "❌ Test failed - no GPU load completion found"
  echo "Check log: $LOG_FILE"
  exit 1
fi
# Extract GPU load times into array
declare -A gpu_times

#######################################
# Parse the [LOAD_DONE] lines of a log and rank the GPUs by load time.
# Globals (all written):
#   gpu_times   - associative array: GPU number -> duration string
#                 (a later line for the same GPU replaces an earlier one)
#   sorted_gpus - "<gpu> <duration>" lines, ascending by duration,
#                 ties ordered by GPU number
#   min_time    - smallest duration, or empty when nothing was parsed
#   max_time    - largest duration, or empty when nothing was parsed
# Arguments:
#   $1 - path of the log file to scan
#######################################
collect_gpu_times() {
  local log_file=$1
  local gpu_re='GPU ([0-9]+)'
  local duration_re='Duration: ([0-9.]+)'
  local line gpu

  gpu_times=()
  sorted_gpus=""
  min_time=""
  max_time=""

  # Bash regex matching takes the leftmost match and needs neither PCRE
  # support in grep nor extra processes per log line.
  while IFS= read -r line; do
    [[ $line =~ $gpu_re ]] || continue
    gpu=${BASH_REMATCH[1]}
    [[ $line =~ $duration_re ]] || continue
    gpu_times[$gpu]=${BASH_REMATCH[1]}
  done < <(grep "\[LOAD_DONE\]" "$log_file")

  (( ${#gpu_times[@]} > 0 )) || return 0

  # Sort GPUs by load time.
  # LC_ALL=C keeps "." as the decimal point for `sort -n` under any user
  # locale; -k2,2 limits the key to the duration column.
  sorted_gpus=$(
    for gpu in "${!gpu_times[@]}"; do
      printf '%s %s\n' "$gpu" "${gpu_times[$gpu]}"
    done | LC_ALL=C sort -k2,2n -k1,1n
  )

  # Find min and max times: they are the duration fields of the first and
  # last line of the sorted list.
  min_time=${sorted_gpus%%$'\n'*}
  min_time=${min_time#* }
  max_time=${sorted_gpus##*$'\n'}
  max_time=${max_time#* }
}

collect_gpu_times "$LOG_FILE"
# Print table header, one row per GPU, and the table footer
#######################################
# Render the per-GPU load-time table.
# Arguments:
#   $1 - rows of "<gpu> <seconds>", one per line, fastest first
#   $2 - fastest load time in seconds (baseline for the delta column)
# Outputs:
#   The table followed by one blank line, on stdout. The first row is
#   labelled "Fastest"; every other row is rated by its delta to the baseline
#   (<1s Excellent, <5s Good, <10s Moderate, <100s SLOW, otherwise VERY SLOW).
#######################################
print_gpu_table() {
  local rows=$1
  local baseline=$2
  local divider="├───────┼───────────┼──────────────┼──────────────────────┤"
  local gpu load delta tier ratio status delta_str
  local printed=0

  echo "┌───────┬───────────┬──────────────┬──────────────────────┐"
  # Pad the header cells to the same widths as the data rows so the column
  # borders line up with the box-drawing frame.
  printf "│ %-5s │ %-9s │ %-12s │ %-20s │\n" \
    "GPU" "Load Time" "Status" "Delta from Fastest"
  echo "$divider"

  while IFS=' ' read -r gpu load _; do
    # An empty row list still yields one empty line from the here-string.
    [[ -n "$gpu" && -n "$load" ]] || continue

    # One awk call per row: delta (rounded to 2 decimals before rating),
    # rating tier, and slowdown ratio.
    IFS=' ' read -r delta tier ratio < <(
      awk -v t="$load" -v m="$baseline" 'BEGIN {
        d = sprintf("%.2f", t - m) + 0
        if (d < 1)        tier = "excellent"
        else if (d < 5)   tier = "good"
        else if (d < 10)  tier = "moderate"
        else if (d < 100) tier = "slow"
        else              tier = "very_slow"
        # A zero baseline would make t / m a fatal division by zero.
        ratio = (m > 0) ? sprintf("%.1f", t / m) : "n/a"
        printf "%.2f %s %s\n", d, tier, ratio
      }'
    )

    if (( printed == 0 )); then
      status="✅ Fastest"
      delta_str="-"
    else
      # Dividers go between rows only, so the last row is followed directly
      # by the footer instead of a stray divider.
      echo "$divider"
      case "$tier" in
        excellent) status="✅ Excellent" ;;
        good)      status="✅ Good" ;;
        moderate)  status="⚠️ Moderate" ;;
        slow)      status="❌ SLOW" ;;
        *)         status="❌ VERY SLOW" ;;
      esac
      case "$tier" in
        excellent | good | moderate)
          delta_str="+${delta}s"
          ;;
        *)
          if [[ "$ratio" == "n/a" ]]; then
            printf -v delta_str "+%.0fs" "$delta"
          else
            printf -v delta_str "+%.0fs (%.1fx slower!)" "$delta" "$ratio"
          fi
          ;;
      esac
    fi

    printf "│ GPU %1s │ %-9s │ %-12s │ %-20s │\n" \
      "$gpu" "${load}s" "$status" "$delta_str"
    printed=$(( printed + 1 ))
  done <<< "$rows"

  # Table footer
  echo "└───────┴───────────┴──────────────┴──────────────────────┘"
  echo ""
}

print_gpu_table "$sorted_gpus" "$min_time"
# Summary statistics
# avg_time:    mean of the duration column of sorted_gpus
# delta_range: slowest minus fastest
# variance:    delta_range as a percentage of the (already rounded) average,
#              or "0" when the average is not positive
avg_time=$(
  printf '%s\n' "$sorted_gpus" \
    | awk '{ total += $2; rows++ } END { if (rows > 0) printf "%.2f", total / rows }'
)
delta_range=$(awk -v hi="$max_time" -v lo="$min_time" 'BEGIN { printf "%.2f", hi - lo }')
variance=$(
  awk -v hi="$max_time" -v lo="$min_time" -v mean="$avg_time" 'BEGIN {
    if (mean > 0) printf "%.2f", ((hi - lo) / mean) * 100
    else print "0"
  }'
)

printf '%s\n' \
  "Summary:" \
  " GPUs Tested: $LOAD_COUNT" \
  " Fastest: ${min_time}s" \
  " Slowest: ${max_time}s" \
  " Average: ${avg_time}s" \
  " Delta: ${delta_range}s" \
  " Variance: ${variance}%" \
  ""

# Overall assessment
# Classify the spread once, then pick the message for that class.
verdict=$(
  awk -v d="$delta_range" 'BEGIN {
    if (d < 1)       print "excellent"
    else if (d < 5)  print "good"
    else if (d < 10) print "moderate"
    else             print "high"
  }'
)
case "$verdict" in
  excellent)
    echo "✅ EXCELLENT - All GPUs load within 1s of each other"
    ;;
  good)
    echo "✅ GOOD - GPUs load within 5s variance"
    ;;
  moderate)
    echo "⚠️ MODERATE - Some variance detected (${delta_range}s)"
    ;;
  *)
    echo "❌ HIGH VARIANCE - Investigate slow GPUs (${delta_range}s difference)"
    echo ""
    echo "Recommended actions:"
    echo " 1. Check which GPUs are slow (see table above)"
    echo " 2. Run test again to verify consistency"
    echo " 3. Check firmware versions: rocm-smi --showfw"
    echo " 4. Check PCIe links: lspci -vv | grep LnkSta"
    ;;
esac
echo
echo "Full log: $LOG_FILE"
echo "========================================="