Skip to content

Commit 03015e0

Browse files
JordanNanos and claude committed
Update MI325X runners to new amds naming convention
Old mi325x-amd_* runners are offline. New mi325x-amds_* runners (00-08) are online. Update runners.yaml and add launch_mi325x-amds.sh (copy of launch_mi325x-amd.sh) for the new runner name pattern. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e661747 commit 03015e0

File tree

2 files changed

+222
-8
lines changed

2 files changed

+222
-8
lines changed

.github/configs/runners.yaml

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,15 +71,19 @@ mi300x:
7171
- 'mi300x-amds_2'
7272
- 'mi300x-amds_3'
7373
mi325x:
74-
- 'mi325x-amd_0'
75-
- 'mi325x-amd_1'
76-
- 'mi325x-amd_2'
77-
- 'mi325x-amd_3'
74+
- 'mi325x-amds_00'
75+
- 'mi325x-amds_01'
76+
- 'mi325x-amds_02'
77+
- 'mi325x-amds_03'
78+
- 'mi325x-amds_04'
79+
- 'mi325x-amds_05'
80+
- 'mi325x-amds_06'
81+
- 'mi325x-amds_08'
7882
mi325x-disagg:
79-
- 'mi325x-amd_0'
80-
- 'mi325x-amd_1'
81-
- 'mi325x-amd_2'
82-
- 'mi325x-amd_3'
83+
- 'mi325x-amds_00'
84+
- 'mi325x-amds_01'
85+
- 'mi325x-amds_02'
86+
- 'mi325x-amds_03'
8387
mi355x:
8488
- 'mi355x-amds_0'
8589
- 'mi355x-amds_1'

runners/launch_mi325x-amds.sh

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
#!/usr/bin/env bash
#
# Launcher for benchmark jobs on the mi325x-amds_* GitHub Actions runners.
# Expects env from the workflow: EXP_NAME, PRECISION, FRAMEWORK, MODEL,
# ISL, OSL, RESULT_FILENAME, GITHUB_WORKSPACE, RUNNER_NAME, IMAGE, TP,
# HF_HUB_CACHE (single-node path). Dispatches to either the multi-node
# (sbatch + Docker) or single-node (salloc + enroot) path based on which
# benchmark script exists.

export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
export PORT=8888

# Local NVMe cache for model weights (set to empty to disable)
# MI325X nodes have 8x 3.5TB NVMe drives; /local-nvme must be set up
# via: sudo bash utils/setup_local_nvme.sh /local-nvme
export LOCAL_MODEL_CACHE_DIR="${LOCAL_MODEL_CACHE_DIR:-/local-nvme/models}"

PARTITION="compute"

# Detect benchmark subdir from where the script lives.
# Multi-node scripts include the framework suffix (e.g. _sglang-disagg.sh);
# single-node scripts do not (e.g. dsr1_fp8_mi325x.sh).
SCRIPT_NAME_WITH_FW="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh"
SCRIPT_NAME_BASE="${EXP_NAME%%_*}_${PRECISION}_mi325x.sh"
if [[ -f "benchmarks/multi_node/${SCRIPT_NAME_WITH_FW}" ]]; then
  BENCHMARK_SUBDIR="multi_node"
  SCRIPT_NAME="${SCRIPT_NAME_WITH_FW}"
elif [[ -f "benchmarks/single_node/${SCRIPT_NAME_BASE}" ]]; then
  BENCHMARK_SUBDIR="single_node"
  SCRIPT_NAME="${SCRIPT_NAME_BASE}"
else
  echo "ERROR: neither benchmarks/multi_node/${SCRIPT_NAME_WITH_FW} nor benchmarks/single_node/${SCRIPT_NAME_BASE} found"
  exit 1
fi

# =============================================================================
# Multi-node disaggregated path: sbatch + Docker via submit.sh
# =============================================================================
if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then

  #######################################
  # Cancel a Slurm job and block until it leaves the queue.
  # Arguments: $1 - job id; $2 - timeout in seconds (default 600)
  # Returns:   0 once the job is gone, 1 if it is still queued after timeout
  #######################################
  scancel_sync() {
    local jobid=$1
    local timeout=${2:-600}
    local interval=10
    local start
    start=$(date +%s)

    echo "[scancel_sync] Requesting cancel of job $jobid"
    scancel "$jobid" || true

    while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do
      local now
      now=$(date +%s)
      if (( now - start >= timeout )); then
        echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s"
        return 1
      fi
      echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..."
      sleep "$interval"
    done
    echo "[scancel_sync] job $jobid exited"
    return 0
  }

  set -x

  export SLURM_ACCOUNT="$USER"
  export SLURM_PARTITION="$PARTITION"
  export SLURM_JOB_NAME="benchmark-sglang-disagg.job"

  export MODEL_PATH="${HF_HUB_CACHE_MOUNT%/}"

  # MODEL_YAML_KEY: top-level key in models.yaml for server config lookup.
  if [[ -z "${MODEL_YAML_KEY:-}" ]]; then
    export MODEL_YAML_KEY="${MODEL##*/}"
  fi

  # MODEL_NAME: relative path under MODEL_PATH for --model-path inside the container.
  # Auto-resolved from HF hub cache layout so no symlink is needed.
  if [[ -z "${MODEL_NAME:-}" ]]; then
    _HF_DIR="models--$(echo "${MODEL}" | sed 's|/|--|g')"
    # Lexically-latest snapshot dir wins; snapshot names are single tokens,
    # so the ls | sort | tail pipeline is safe here.
    _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1)
    if [[ -n "${_SNAPSHOT}" ]]; then
      export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}"
    elif [[ -d "${MODEL_PATH}/${MODEL##*/}" ]]; then
      # Cluster stores models as flat dirs named after the repo (e.g. DeepSeek-R1-0528),
      # not in HF hub cache layout. Use repo name so MODEL_YAML_KEY can differ from
      # the path (e.g. DeepSeek-R1-0528-bnxt yaml key → DeepSeek-R1-0528 dir).
      export MODEL_NAME="${MODEL##*/}"
    else
      export MODEL_NAME="${MODEL_YAML_KEY}"
    fi
  fi

  export GPUS_PER_NODE=8

  export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}"
  mkdir -p "$BENCHMARK_LOGS_DIR"
  # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks
  timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true

  # The benchmark script prints the submitted Slurm job id on stdout.
  JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}")

  if [[ -z "$JOB_ID" ]]; then
    echo "ERROR: benchmark script produced no job ID"
    exit 1
  fi

  LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out"

  sleep 10

  # Wait for the job to start writing its log; bail out if the job vanished
  # from the queue before ever creating it.
  while [[ ! -e "$LOG_FILE" ]]; do
    if ! squeue -u "$USER" --noheader --format='%i' | grep -qx "$JOB_ID"; then
      echo "ERROR: Job $JOB_ID failed before creating log file"
      scontrol show job "$JOB_ID"
      exit 1
    fi
    sleep 5
  done

  set +x

  # Background poller exits once the job leaves the queue; tail --pid then
  # terminates, giving live log streaming for exactly the job's lifetime.
  (
    while squeue -u "$USER" --noheader --format='%i' | grep -qx "$JOB_ID"; do
      sleep 10
    done
  ) &
  POLL_PID=$!

  tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null

  wait $POLL_PID

  set -x

  # Helper: print the n most-recent logs/<name>/sglang_isl_<isl>_osl_<osl> dirs.
  cat > collect_latest_results.py <<'PY'
import os, sys
sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
    print(path)
PY

  LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1)
  if [[ -z "$LOGS_DIR" ]]; then
    echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
    exit 1
  fi

  echo "Found logs directory: $LOGS_DIR"
  ls -la "$LOGS_DIR"

  # NUL-delimited find so result paths with whitespace survive intact
  # (the previous `for f in $(find ...)` word-split on spaces).
  while IFS= read -r -d '' result_file; do
    file_name=$(basename "$result_file")
    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}"
    echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}"
    cp "$result_file" "$WORKSPACE_RESULT_FILE"
  done < <(find "$LOGS_DIR" -type f -print0)

  echo "All result files processed"
  set +x
  scancel_sync "$JOB_ID"
  set -x
  echo "Canceled the slurm job $JOB_ID"

  # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks
  timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true

  if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
    ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts"
    mkdir -p "$ARTIFACT_DIR"
    cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true
    echo "Logs copied to $ARTIFACT_DIR for artifact upload"
  fi

# =============================================================================
# Single-node path: enroot via salloc + srun
# =============================================================================
else

  # Squash image path derived from the Docker image ref; chars that are
  # illegal in filenames (/ : @ #) become underscores.
  SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
  LOCK_FILE="${SQUASH_FILE}.lock"

  set -x

  # --no-shell keeps the allocation alive without an interactive shell;
  # tee /dev/stderr preserves salloc output in the runner log while we
  # scrape the granted job id from it.
  JOB_ID=$(salloc --partition="$PARTITION" --gres=gpu:"$TP" --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')

  if [[ -z "$JOB_ID" ]]; then
    echo "ERROR: salloc failed to allocate a job"
    exit 1
  fi

  # Import (or reuse) the enroot squash image under an flock so concurrent
  # runners sharing the NFS squash dir don't clobber each other's import.
  srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c "
    exec 9>\"$LOCK_FILE\"
    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
      echo 'Squash file already exists and is valid, skipping import'
    else
      rm -f \"$SQUASH_FILE\"
      enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
    fi
  "
  srun --jobid="$JOB_ID" \
    --container-image="$SQUASH_FILE" \
    --container-mounts="$GITHUB_WORKSPACE":/workspace/,"$HF_HUB_CACHE_MOUNT":"$HF_HUB_CACHE" \
    --container-mount-home \
    --container-writable \
    --container-remap-root \
    --container-workdir=/workspace/ \
    --no-container-entrypoint --export=ALL \
    bash "benchmarks/single_node/${SCRIPT_NAME}"

  scancel "$JOB_ID"

fi

0 commit comments

Comments
 (0)