|
#!/bin/bash
# Execute-node script for Week 4 GRPO training via verl.
#
# Arguments:
#   $1  config name passed to verl as --config-name (default: grpo_qwen1_5b)
#   $2  optional SFT adapter name; when given, the adapter is merged into the
#       base model and the merged model is used as the starting policy
#
# Steps:
#   1. Standard CHTC env setup (USER, HOME, UUID remap, caches)
#   2. Untar repo from /staging
#   3. pip install our package (verl + deps already in container)
#   4. If an SFT adapter name was passed, merge it into base → starting policy
#   5. Run verl's main_ppo with the config; override model.path if merged
#   6. Tar the final checkpoint dir for transfer back

set -euo pipefail

# Settings are read from a .env file in the job's starting directory; the
# script aborts here if it is missing.
# NOTE(review): presumably this is where CHTC_USER / HF_TOKEN / WANDB_API_KEY
# come from — confirm against the submit file.
# shellcheck disable=SC1091
source .env

CONFIG_NAME="${1:-grpo_qwen1_5b}"
ADAPTER_NAME="${2:-}"

# Container has no /etc/passwd entry for the job UID.
export USER="${CHTC_USER:-runner}"
export LOGNAME="${USER}"

# Remember where the job started: the checkpoint tarball is written here.
INITIAL_PWD=$(pwd)

# Create an empty placeholder tarball right away. Every later step runs under
# `set -e`, so a failed copy / install / sanity check would otherwise exit
# before any checkpoint.tar.gz exists and HTCondor's output transfer would put
# the job on hold. The placeholder is overwritten on training success.
tar -czf "${INITIAL_PWD}/checkpoint.tar.gz" --files-from /dev/null

#######################################
# Point HOME and all cache/log locations at the job's scratch directory.
# Globals:   USER (read); HOME, HF_HOME, TRANSFORMERS_CACHE, HF_DATASETS_CACHE,
#            HF_MODULES_CACHE, TORCHINDUCTOR_CACHE_DIR, XDG_CACHE_HOME,
#            WANDB_DIR, RAY_TMPDIR (exported)
# Arguments: $1 - scratch directory to use as the root for everything
# Outputs:   a warning on stderr if WANDB_DIR cannot be created
#######################################
setup_scratch_env() {
  local scratch="$1"

  export HOME="${scratch}"
  export HF_HOME="${scratch}/hf_home"
  export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
  export HF_DATASETS_CACHE="${HF_HOME}/datasets"
  export HF_MODULES_CACHE="${HF_HOME}/modules"
  export TORCHINDUCTOR_CACHE_DIR="${scratch}/torch_cache"
  export XDG_CACHE_HOME="${scratch}/xdg_cache"
  export WANDB_DIR="${scratch}/wandb"
  export RAY_TMPDIR="/tmp/ray_${USER}"

  # W&B expects WANDB_DIR to exist already; create it up front so run files
  # land in scratch. Best-effort: a failure here only warns.
  mkdir -p "${WANDB_DIR}" \
    || echo "WARNING: could not create WANDB_DIR=${WANDB_DIR}" >&2
}

setup_scratch_env "${_CONDOR_SCRATCH_DIR}"

#######################################
# vllm: remap a UUID-form CUDA_VISIBLE_DEVICES (GPU-… / MIG-…) to integer
# indices. Integer-form values are left untouched.
#
# A list of N UUIDs becomes "0,1,…,N-1", so a multi-GPU allocation keeps all
# of its devices; a single UUID becomes "0".
# NOTE(review): assumes the allocated devices are enumerated 0..N-1 inside the
# container — confirm for the target pool.
# Globals:   CUDA_VISIBLE_DEVICES (read, re-exported when remapped)
# Outputs:   one info line on stdout when a remap happens
#######################################
remap_cuda_visible_devices() {
  local visible="${CUDA_VISIBLE_DEVICES:-}"
  [[ "${visible}" =~ ^(GPU-|MIG-) ]] || return 0

  local -a uuids=()
  IFS=',' read -r -a uuids <<<"${visible}"

  local remapped="" i
  for ((i = 0; i < ${#uuids[@]}; i++)); do
    remapped+="${remapped:+,}${i}"
  done

  echo "[info] remapping CUDA_VISIBLE_DEVICES='${visible}' -> '${remapped}'"
  export CUDA_VISIBLE_DEVICES="${remapped}"
}

remap_cuda_visible_devices

export VLLM_USAGE_DISABLE=1
# NOTE(review): vLLM's documented opt-out switch is VLLM_NO_USAGE_STATS; it is
# set alongside the original variable, which is kept for compatibility.
export VLLM_NO_USAGE_STATS=1
export OUTLINES_CACHE_DIR=/tmp/.outlines

# Pull the repo tarball from the user's /staging area, unpack it in the
# current directory, drop the archive, and continue from inside the repo.
REPO=verifiable-rl-coder
repo_tarball="${REPO}.tar.gz"
echo "==> fetching code from /staging/${CHTC_USER}/${REPO}.tar.gz"
cp "/staging/${CHTC_USER}/${repo_tarball}" .
tar -xzf "${repo_tarball}"
rm "${repo_tarball}"
cd "${REPO}"

# Editable install of this package with the dev and gpu extras.
echo "==> pip install -e .[dev,gpu]"
pip install --quiet -e '.[dev,gpu]'

# Best-effort logins: each one runs only when its credential is set and
# non-empty, and a failed login never aborts the job.
[[ -z "${HF_TOKEN:-}" ]] || hf auth login --token "${HF_TOKEN}" || true
[[ -z "${WANDB_API_KEY:-}" ]] || wandb login --relogin "${WANDB_API_KEY}" || true

# Pre-warm evalplus (used by SubprocessVerifier inside compute_reward):
# call both dataset loaders once before training starts.
python -c '
from evalplus.data import get_human_eval_plus, get_mbpp_plus
get_human_eval_plus()
get_mbpp_plus()
'

# Always produce a placeholder tarball so HTCondor's transfer never holds
# the job — overwritten on training success. This runs BEFORE the dataset
# check so that the check's `exit 1` still leaves a tarball to transfer.
tar -czf "${INITIAL_PWD}/checkpoint.tar.gz" --files-from /dev/null

# Sanity-check that the GRPO dataset is in the tarball.
DATA_PATH="results/grpo_dataset/v1/train.parquet"
if [[ ! -f "${DATA_PATH}" ]]; then
  {
    echo "ERROR: GRPO dataset not found at ${DATA_PATH}"
    echo "Did you run scripts/build_grpo_dataset.py before transfer.sh?"
  } >&2
  exit 1
fi
echo "==> dataset: ${DATA_PATH}"

# --- Optional: merge SFT adapter into base for warm-start ---
# MODEL_OVERRIDE stays empty when no adapter name was given; otherwise it
# holds the single override that points verl's model path at the merged
# model directory.
MODEL_OVERRIDE=""
if [[ -z "${ADAPTER_NAME:-}" ]]; then
  echo "==> using base model (no SFT warm-start)"
else
  ADAPTER_DIR="results/sft_checkpoints/${ADAPTER_NAME}"
  if [[ ! -d "${ADAPTER_DIR}" ]]; then
    # Diagnostics go to stderr so they show up in the job's error log.
    echo "ERROR: adapter not found at ${ADAPTER_DIR}" >&2
    exit 1
  fi
  # The merged model is written to scratch, outside the repo checkout.
  MERGED_DIR="${_CONDOR_SCRATCH_DIR}/merged_${ADAPTER_NAME}"
  echo "==> merging adapter -> ${MERGED_DIR}"
  python scripts/merge_lora.py --adapter-dir "${ADAPTER_DIR}" --out "${MERGED_DIR}"
  MODEL_OVERRIDE="actor_rollout_ref.model.path=${MERGED_DIR}"
  echo "==> using merged SFT as GRPO starting policy"
fi

# --- Launch verl ---

#######################################
# Build the argument vector for verl.trainer.main_ppo.
# The model override is appended as ONE argument, so a path containing
# spaces or glob characters is passed through intact.
# Globals:   CONFIG_NAME (read), MODEL_OVERRIDE (read, may be empty/unset),
#            VERL_ARGS (written, array)
#######################################
build_verl_args() {
  VERL_ARGS=(--config-path=configs --config-name="${CONFIG_NAME}")
  if [[ -n "${MODEL_OVERRIDE:-}" ]]; then
    VERL_ARGS+=("${MODEL_OVERRIDE}")
  fi
}

echo "==> python -m verl.trainer.main_ppo --config-path=configs --config-name=${CONFIG_NAME}"

# Don't bail on training failure — we still want the placeholder tarball
# transferred so the job doesn't end up held. `|| TRAIN_EXIT=$?` records the
# status without tripping `set -e`.
build_verl_args
TRAIN_EXIT=0
python -m verl.trainer.main_ppo "${VERL_ARGS[@]}" || TRAIN_EXIT=$?

if ((TRAIN_EXIT != 0)); then
  echo "ERROR: verl exited ${TRAIN_EXIT}. Placeholder tarball will be transferred." >&2
fi

# --- Tar checkpoint for transfer back ---
# verl writes to default_local_dir from the config. Parse it.
# The config path is passed as an argument (not spliced into the Python
# source), so an unusual CONFIG_NAME cannot alter the code being run.
# Any failure (missing file, missing key, no PyYAML) yields an empty string.
# NOTE(review): this reads the raw YAML; Hydra interpolations or command-line
# overrides of trainer.default_local_dir would not be resolved here — confirm
# the configs use a literal path.
DEFAULT_LOCAL_DIR=$(python -c '
import sys
import yaml
with open(sys.argv[1]) as f:
    cfg = yaml.safe_load(f)
print(cfg["trainer"]["default_local_dir"])
' "configs/${CONFIG_NAME}.yaml" 2>/dev/null || echo "")

FINAL_TARBALL="${INITIAL_PWD}/checkpoint.tar.gz"

if [[ -n "${DEFAULT_LOCAL_DIR}" && -d "${DEFAULT_LOCAL_DIR}" ]]; then
  echo "==> tarring checkpoint dir: ${DEFAULT_LOCAL_DIR}"
  # Write to a sibling temp file and rename on success, so a failed or
  # interrupted tar never replaces the placeholder with a truncated archive.
  if tar -czf "${FINAL_TARBALL}.partial" \
    -C "$(dirname "${DEFAULT_LOCAL_DIR}")" \
    "$(basename "${DEFAULT_LOCAL_DIR}")"; then
    mv -f "${FINAL_TARBALL}.partial" "${FINAL_TARBALL}"
    ls -lh "${FINAL_TARBALL}"
  else
    rm -f "${FINAL_TARBALL}.partial"
    echo "WARNING: failed to tar ${DEFAULT_LOCAL_DIR} — placeholder tarball stays." >&2
    # Training may have succeeded, but the job produced no usable archive:
    # report failure without hiding a non-zero training status.
    if ((TRAIN_EXIT == 0)); then
      TRAIN_EXIT=1
    fi
  fi
else
  echo "WARNING: ${DEFAULT_LOCAL_DIR} not found — placeholder tarball stays." >&2
fi

# Propagate the training status (or the archive failure) as the job's status.
exit "${TRAIN_EXIT}"
0 commit comments