|
#!/usr/bin/env bash

# Shared HF hub cache on NFS, mounted into every benchmark container.
export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
export PORT=8888

# Local NVMe cache for model weights (set to empty to disable)
# MI325X nodes have 8x 3.5TB NVMe drives; /local-nvme must be set up
# via: sudo bash utils/setup_local_nvme.sh /local-nvme
: "${LOCAL_MODEL_CACHE_DIR:=/local-nvme/models}"
export LOCAL_MODEL_CACHE_DIR

# Slurm partition used by both the multi-node and single-node paths.
PARTITION="compute"
# Detect benchmark subdir from where the script lives.
# Multi-node scripts include the framework suffix (e.g. _sglang-disagg.sh);
# single-node scripts do not (e.g. dsr1_fp8_mi325x.sh).
# Inputs:  EXP_NAME (prefix before first '_'), PRECISION, FRAMEWORK.
# Outputs: BENCHMARK_SUBDIR, SCRIPT_NAME.
SCRIPT_NAME_WITH_FW="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh"
SCRIPT_NAME_BASE="${EXP_NAME%%_*}_${PRECISION}_mi325x.sh"
if [[ -f "benchmarks/multi_node/${SCRIPT_NAME_WITH_FW}" ]]; then
  BENCHMARK_SUBDIR="multi_node"
  SCRIPT_NAME="${SCRIPT_NAME_WITH_FW}"
elif [[ -f "benchmarks/single_node/${SCRIPT_NAME_BASE}" ]]; then
  BENCHMARK_SUBDIR="single_node"
  SCRIPT_NAME="${SCRIPT_NAME_BASE}"
else
  # Diagnostics go to stderr so stdout stays clean for log scraping.
  echo "ERROR: neither benchmarks/multi_node/${SCRIPT_NAME_WITH_FW} nor benchmarks/single_node/${SCRIPT_NAME_BASE} found" >&2
  exit 1
fi
| 28 | + |
| 29 | +# ============================================================================= |
| 30 | +# Multi-node disaggregated path: sbatch + Docker via submit.sh |
| 31 | +# ============================================================================= |
| 32 | +if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then |
| 33 | + |
| 34 | + scancel_sync() { |
| 35 | + local jobid=$1 |
| 36 | + local timeout=${2:-600} |
| 37 | + local interval=10 |
| 38 | + local start |
| 39 | + start=$(date +%s) |
| 40 | + |
| 41 | + echo "[scancel_sync] Requesting cancel of job $jobid" |
| 42 | + scancel "$jobid" || true |
| 43 | + |
| 44 | + while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do |
| 45 | + local now |
| 46 | + now=$(date +%s) |
| 47 | + if (( now - start >= timeout )); then |
| 48 | + echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s" |
| 49 | + return 1 |
| 50 | + fi |
| 51 | + echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..." |
| 52 | + sleep "$interval" |
| 53 | + done |
| 54 | + echo "[scancel_sync] job $jobid exited" |
| 55 | + return 0 |
| 56 | + } |
| 57 | + |
| 58 | + set -x |
| 59 | + |
| 60 | + export SLURM_ACCOUNT="$USER" |
| 61 | + export SLURM_PARTITION="$PARTITION" |
| 62 | + export SLURM_JOB_NAME="benchmark-sglang-disagg.job" |
| 63 | + |
| 64 | + export MODEL_PATH="${HF_HUB_CACHE_MOUNT%/}" |
| 65 | + |
| 66 | + # MODEL_YAML_KEY: top-level key in models.yaml for server config lookup. |
| 67 | + if [[ -z "${MODEL_YAML_KEY:-}" ]]; then |
| 68 | + export MODEL_YAML_KEY="${MODEL##*/}" |
| 69 | + fi |
| 70 | + |
| 71 | + # MODEL_NAME: relative path under MODEL_PATH for --model-path inside the container. |
| 72 | + # Auto-resolved from HF hub cache layout so no symlink is needed. |
| 73 | + if [[ -z "${MODEL_NAME:-}" ]]; then |
| 74 | + _HF_DIR="models--$(echo "${MODEL}" | sed 's|/|--|g')" |
| 75 | + _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1) |
| 76 | + if [[ -n "${_SNAPSHOT}" ]]; then |
| 77 | + export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}" |
| 78 | + elif [[ -d "${MODEL_PATH}/${MODEL##*/}" ]]; then |
| 79 | + # Cluster stores models as flat dirs named after the repo (e.g. DeepSeek-R1-0528), |
| 80 | + # not in HF hub cache layout. Use repo name so MODEL_YAML_KEY can differ from |
| 81 | + # the path (e.g. DeepSeek-R1-0528-bnxt yaml key → DeepSeek-R1-0528 dir). |
| 82 | + export MODEL_NAME="${MODEL##*/}" |
| 83 | + else |
| 84 | + export MODEL_NAME="${MODEL_YAML_KEY}" |
| 85 | + fi |
| 86 | + fi |
| 87 | + |
| 88 | + export GPUS_PER_NODE=8 |
| 89 | + |
| 90 | + export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" |
| 91 | + mkdir -p "$BENCHMARK_LOGS_DIR" |
| 92 | + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks |
| 93 | + timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true |
| 94 | + |
| 95 | + JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") |
| 96 | + |
| 97 | + if [[ -z "$JOB_ID" ]]; then |
| 98 | + echo "ERROR: benchmark script produced no job ID" |
| 99 | + exit 1 |
| 100 | + fi |
| 101 | + |
| 102 | + LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" |
| 103 | + |
| 104 | + sleep 10 |
| 105 | + |
| 106 | + while ! ls "$LOG_FILE" &>/dev/null; do |
| 107 | + if ! squeue -u "$USER" --noheader --format='%i' | grep -qx "$JOB_ID"; then |
| 108 | + echo "ERROR: Job $JOB_ID failed before creating log file" |
| 109 | + scontrol show job "$JOB_ID" |
| 110 | + exit 1 |
| 111 | + fi |
| 112 | + sleep 5 |
| 113 | + done |
| 114 | + |
| 115 | + set +x |
| 116 | + |
| 117 | + ( |
| 118 | + while squeue -u $USER --noheader --format='%i' | grep -qx "$JOB_ID"; do |
| 119 | + sleep 10 |
| 120 | + done |
| 121 | + ) & |
| 122 | + POLL_PID=$! |
| 123 | + |
| 124 | + tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null |
| 125 | + |
| 126 | + wait $POLL_PID |
| 127 | + |
| 128 | + set -x |
| 129 | + |
| 130 | + cat > collect_latest_results.py <<'PY' |
| 131 | +import os, sys |
| 132 | +sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) |
| 133 | +for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: |
| 134 | + print(path) |
| 135 | +PY |
| 136 | + |
| 137 | + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) |
| 138 | + if [ -z "$LOGS_DIR" ]; then |
| 139 | + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" |
| 140 | + exit 1 |
| 141 | + fi |
| 142 | + |
| 143 | + echo "Found logs directory: $LOGS_DIR" |
| 144 | + ls -la "$LOGS_DIR" |
| 145 | + |
| 146 | + for result_file in $(find $LOGS_DIR -type f); do |
| 147 | + file_name=$(basename $result_file) |
| 148 | + if [ -f $result_file ]; then |
| 149 | + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" |
| 150 | + echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}" |
| 151 | + cp $result_file $WORKSPACE_RESULT_FILE |
| 152 | + fi |
| 153 | + done |
| 154 | + |
| 155 | + echo "All result files processed" |
| 156 | + set +x |
| 157 | + scancel_sync $JOB_ID |
| 158 | + set -x |
| 159 | + echo "Canceled the slurm job $JOB_ID" |
| 160 | + |
| 161 | + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks |
| 162 | + timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true |
| 163 | + |
| 164 | + if [[ -n "${GITHUB_ACTIONS:-}" ]]; then |
| 165 | + ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" |
| 166 | + mkdir -p "$ARTIFACT_DIR" |
| 167 | + cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true |
| 168 | + echo "Logs copied to $ARTIFACT_DIR for artifact upload" |
| 169 | + fi |
| 170 | + |
| 171 | +# ============================================================================= |
| 172 | +# Single-node path: enroot via salloc + srun |
| 173 | +# ============================================================================= |
| 174 | +else |
| 175 | + |
| 176 | + SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" |
| 177 | + LOCK_FILE="${SQUASH_FILE}.lock" |
| 178 | + |
| 179 | + set -x |
| 180 | + |
| 181 | + JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') |
| 182 | + |
| 183 | + if [ -z "$JOB_ID" ]; then |
| 184 | + echo "ERROR: salloc failed to allocate a job" |
| 185 | + exit 1 |
| 186 | + fi |
| 187 | + |
  # Build the enroot squash file exactly once per image: every runner
  # contends on a shared NFS lock file (fd 9) inside the allocation, and the
  # winner validates the existing squash (unsquashfs -l) or re-imports it.
  # NOTE(review): $LOCK_FILE/$SQUASH_FILE/$IMAGE are expanded by the OUTER
  # shell before srun runs; the \" escapes only protect the remote shell
  # against spaces in the expanded paths.
  srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
    exec 9>\"$LOCK_FILE\"
    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
      echo 'Squash file already exists and is valid, skipping import'
    else
      rm -f \"$SQUASH_FILE\"
      enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
    fi
  "
| 198 | + srun --jobid=$JOB_ID \ |
| 199 | + --container-image=$SQUASH_FILE \ |
| 200 | + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ |
| 201 | + --container-mount-home \ |
| 202 | + --container-writable \ |
| 203 | + --container-remap-root \ |
| 204 | + --container-workdir=/workspace/ \ |
| 205 | + --no-container-entrypoint --export=ALL \ |
| 206 | + bash benchmarks/single_node/${SCRIPT_NAME} |
| 207 | + |
| 208 | + scancel $JOB_ID |
| 209 | + |
| 210 | +fi |