-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlaunch_search.sh
More file actions
72 lines (65 loc) · 2.25 KB
/
Copy pathlaunch_search.sh
File metadata and controls
72 lines (65 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/bin/bash -l
#
# SLURM array job that launches search/evo/evo_search_coarse_importance_sorted.py
# once per array task, passing the array task ID as --bin_to_search.
#
# SLURM CONFIG -----------------------------------------------------------------
#SBATCH --job-name=random-search
#SBATCH --time=1-00:00:00
#SBATCH --array=0-2
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:1
#SBATCH --mem=150G
#SBATCH -p mldlc2_gpu-h200 # Partition
#SBATCH --requeue # Requeue if preempted
# Tip: %A = array job ID, %a = task ID, %j = single-job ID, %x = job name
#SBATCH --output=logs/%x_%A_%a_%j.out
#SBATCH --error=logs/%x_%A_%a_%j.err
# -----------------------------------------------------------------------------

# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# NOTE: when submitted through sbatch, the scheduler opens the --output/--error
# files itself, so logs/ should already exist at submit time. The mkdir below
# covers direct (non-sbatch) runs and subsequent submissions.
mkdir -p logs
# Tunable settings. Each one keeps a value already present in the environment
# (e.g. MODEL_ID=... sbatch this_script.sh) and otherwise falls back to the
# default given here; an empty value is treated the same as unset.
: "${MODEL_ID:=Qwen/Qwen3-8B-Base}"
: "${SEQ_LEN:=512}"
: "${BATCH_SIZE:=2}"
: "${MAX_EPOCHS:=100}"
: "${NUM_BINS:=20}"
: "${LW:=}" # e.g., '--label_smoothing 0.1'
# Guarantee SLURM_ARRAY_TASK_ID has a value so later expansions are safe under
# `set -u`, including when the script runs outside a SLURM array.
# Resolution order (empty counts as missing at every step):
#   1. SLURM_ARRAY_TASK_ID already in the environment
#   2. TASK_ID environment variable
#   3. first positional argument
#   4. literal 0
if [[ -z "${SLURM_ARRAY_TASK_ID:-}" ]]; then
  if [[ -n "${TASK_ID:-}" ]]; then
    SLURM_ARRAY_TASK_ID="${TASK_ID}"
  elif [[ -n "${1:-}" ]]; then
    SLURM_ARRAY_TASK_ID="$1"
  else
    SLURM_ARRAY_TASK_ID=0
  fi
fi
# Pick a random, likely-free master port.
#
# The embedded Python tries up to 200 random ports in [20000, 65000], binding
# a TCP socket to each; it prints the first port that binds, or 0 if none did.
# The probe socket is closed again before the port is used, so the port is only
# "likely" free: another process can still claim it in the meantime.
#
# The probe needs nothing beyond the standard library, so fall back to python3
# when no `python` executable is on PATH.
if command -v python >/dev/null 2>&1; then
  port_probe_python=python
elif command -v python3 >/dev/null 2>&1; then
  port_probe_python=python3
else
  echo "Error: no python interpreter found on PATH. Exiting." >&2
  exit 1
fi

MASTER_PORT=$("${port_probe_python}" - <<'PY'
import random
import socket

for _ in range(200):
    port = random.randint(20000, 65000)
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind(("", port))
        except OSError:
            continue
        print(port)
        break
else:
    print(0)
PY
)

# Validate port: must be all digits (this also rejects an empty value) and
# must not be the 0 sentinel printed when every bind attempt failed.
# Diagnostics go to stderr so they end up in the job's .err log.
if [[ ! "${MASTER_PORT}" =~ ^[0-9]+$ || "${MASTER_PORT}" -eq 0 ]]; then
  echo "Error: MASTER_PORT is invalid or empty. Exiting." >&2
  exit 1
fi
echo "Using master port: ${MASTER_PORT}"
echo "SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID}"

# Arguments for the search script. SEQ_LEN, BATCH_SIZE, MODEL_ID and MAX_EPOCHS
# come from the optional knobs defined earlier in this script, so environment
# overrides take effect; their defaults equal the values that used to be
# hard-coded here (512, 2, Qwen/Qwen3-8B-Base, 100).
# --bin_to_search receives the array task ID, so each array task gets a
# different value.
# NOTE(review): NUM_BINS and LW are defined as knobs but are not forwarded,
# because the flags they should map to are not visible here — confirm against
# the search script before wiring them in.
search_args=(
  --seq_len "${SEQ_LEN}"
  --batch-size "${BATCH_SIZE}"
  --model_id "${MODEL_ID}"
  --sorted_indices_path sorted_ids_norm-mean_block_importance.pkl
  --model_path permuted_model_norm-mean_block_importance.pth
  --data-path /work/dlclarge2/sukthank-whittle/dense-lotteries/dataloaders/wikitext/
  --resume_evo ""
  --bin_to_search "${SLURM_ARRAY_TASK_ID}"
  --max-epochs "${MAX_EPOCHS}"
)

# Single-process launch through torch.distributed.launch on the port chosen
# above.
python -m torch.distributed.launch \
  --nproc_per_node=1 \
  --master_port="${MASTER_PORT}" \
  --use_env \
  search/evo/evo_search_coarse_importance_sorted.py \
  "${search_args[@]}"