Skip to content

Commit 20bf6ec

Browse files
chtc: train_grpo.sub/sh + submit wrapper; merge SFT adapter then run verl
1 parent 559d4dc commit 20bf6ec

3 files changed

Lines changed: 223 additions & 0 deletions

File tree

chtc/submit_train_grpo.sh

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/bin/bash
2+
# Submit a verl GRPO training job to CHTC. Run from the head node.
3+
#
4+
# Usage:
5+
# ./submit_train_grpo.sh <run-name> [config-name] [n-gpus] [adapter-name]
6+
#
7+
# run-name identifier for this run (e.g. "grpo-v1-smoke")
8+
# config-name matches configs/<name>.yaml, default: grpo_qwen1_5b
9+
# n-gpus 1 for 1.5B, 4 for 7B. Default: 1.
10+
# adapter-name SFT adapter under results/sft_checkpoints/, default:
11+
# qwen-1.5b-sft-v1. Pass empty string ("") to skip the
12+
# merge and train from base model in the config.
13+
# Strict mode: abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Work from the directory that holds this script so the relative paths used
# below (.env, train_grpo.sh, train_grpo.sub, results/, logs/) resolve.
script_dir="$(dirname "$0")"
cd "${script_dir}"

# shellcheck disable=SC1091
. .env

# The run name is the only mandatory argument; print usage and abort without it.
RUN_NAME="${1:?usage: $0 <run-name> [config-name] [n-gpus] [adapter-name]}"
# Optional positional arguments. config-name and n-gpus fall back to their
# defaults when omitted or empty.
CONFIG_NAME="${2:-grpo_qwen1_5b}"
N_GPUS="${3:-1}"
# adapter-name uses "-" rather than ":-": the default applies only when the
# argument is omitted entirely. An explicitly empty string ("") must stay
# empty so the job skips the SFT merge and trains from the base model, as the
# usage text documents.
ADAPTER_NAME="${4-qwen-1.5b-sft-v1}"
25+
# Per-run output and log directories, relative to this script's directory.
RUN_DIR="results/${RUN_NAME}"
LOG_DIR="logs/${RUN_NAME}"
mkdir -p "${RUN_DIR}" "${LOG_DIR}"

# Stage the job inputs inside the run directory: train_grpo.sub sets
# initial_dir to results_dir and lists both files in transfer_input_files.
for staged_file in .env train_grpo.sh; do
  cp "${staged_file}" "${RUN_DIR}/${staged_file}"
done

# Submit-time variables consumed by train_grpo.sub as $(name) macros.
submit_vars=(
  "results_dir=${RUN_DIR}"
  "log_dir=${LOG_DIR}"
  "config_name=${CONFIG_NAME}"
  "n_gpus=${N_GPUS}"
  "adapter_name=${ADAPTER_NAME}"
)
condor_submit train_grpo.sub "${submit_vars[@]}"
39+
# Print a short summary of what was submitted and where to look for results.
here="$(pwd)"
printf '%s\n' \
  "" \
  "==> submitted." \
  " run-name: ${RUN_NAME}" \
  " config: configs/${CONFIG_NAME}.yaml" \
  " n_gpus: ${N_GPUS}" \
  " adapter: ${ADAPTER_NAME:-<none — training from base>}" \
  " watch: condor_q / tail -f ${here}/${LOG_DIR}/job.out" \
  " checkpoint: ${here}/${RUN_DIR}/checkpoint.tar.gz (after job completes)"

chtc/train_grpo.sh

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#!/bin/bash
2+
# Execute-node script for Week 4 GRPO training via verl.
3+
#
4+
# Steps:
5+
# 1. Standard CHTC env setup (USER, HOME, UUID remap, caches)
6+
# 2. Untar repo from /staging
7+
# 3. pip install our package (verl + deps already in container)
8+
# 4. If an SFT adapter name was passed, merge it into base → starting policy
9+
# 5. Run verl's main_ppo with the config; override model.path if merged
10+
# 6. Tar the final checkpoint dir for transfer back
11+
# Strict mode: abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail

# shellcheck disable=SC1091
. .env

# Positional arguments, in the order given on the `arguments` line of
# train_grpo.sub: config name first, optional adapter name last.
CONFIG_NAME="${1:-grpo_qwen1_5b}"
ADAPTER_NAME="${2:-}"
19+
# Container has no /etc/passwd entry for the job UID.
export USER="${CHTC_USER:-runner}"
export LOGNAME="${USER}"

# Directory the job started in; the output tarball must end up here.
INITIAL_PWD="$(pwd)"

# Create the placeholder output tarball before anything else can fail
# (staging copy, pip install, dataset check, ...), so that
# transfer_output_files always finds checkpoint.tar.gz and HTCondor's
# transfer never holds the job. It is overwritten on training success.
tar -czf "${INITIAL_PWD}/checkpoint.tar.gz" --files-from /dev/null

# Point HOME and every tool cache at the job's scratch directory.
export HOME="${_CONDOR_SCRATCH_DIR}"
export HF_HOME="${_CONDOR_SCRATCH_DIR}/hf_home"
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export HF_MODULES_CACHE="${HF_HOME}/modules"
export TORCHINDUCTOR_CACHE_DIR="${_CONDOR_SCRATCH_DIR}/torch_cache"
export XDG_CACHE_HOME="${_CONDOR_SCRATCH_DIR}/xdg_cache"
export WANDB_DIR="${_CONDOR_SCRATCH_DIR}/wandb"
export RAY_TMPDIR="/tmp/ray_${USER}"
35+
# vllm: remap UUID-form CUDA_VISIBLE_DEVICES to integer indices. Integer-form
# values (HTCondor allocating by index) are left untouched.
#
# A UUID list with N entries becomes "0,1,...,N-1" so multi-GPU jobs keep all
# of their devices; a single UUID still becomes "0".
# NOTE(review): assumes the container exposes only the job's assigned GPUs,
# enumerated from 0 — the same assumption the single-GPU remap relies on.
# Confirm on a multi-GPU slot.
#
# Globals: CUDA_VISIBLE_DEVICES (read; rewritten and exported when UUID-form)
# Outputs: one "[info]" line on stdout when a remap happens
remap_cuda_visible_devices() {
  local visible="${CUDA_VISIBLE_DEVICES:-}"
  [[ "${visible}" =~ ^(GPU-|MIG-) ]] || return 0

  local -a devices=()
  IFS=',' read -r -a devices <<< "${visible}"

  local remapped=""
  local i
  for (( i = 0; i < ${#devices[@]}; i++ )); do
    remapped+="${remapped:+,}${i}"
  done

  echo "[info] remapping CUDA_VISIBLE_DEVICES='${visible}' -> '${remapped}'"
  export CUDA_VISIBLE_DEVICES="${remapped}"
}
remap_cuda_visible_devices

export VLLM_USAGE_DISABLE=1
export OUTLINES_CACHE_DIR=/tmp/.outlines
45+
# Fetch the repo tarball from /staging, unpack it here, and enter the checkout.
REPO=verifiable-rl-coder
repo_tarball="${REPO}.tar.gz"
echo "==> fetching code from /staging/${CHTC_USER}/${repo_tarball}"
cp "/staging/${CHTC_USER}/${repo_tarball}" .
tar -xzf "${repo_tarball}"
rm "${repo_tarball}"
cd "${REPO}"

echo "==> pip install -e .[dev,gpu]"
pip install -e ".[dev,gpu]" --quiet

# Service logins are best-effort: a failed login must not abort the job.
if [[ -n "${HF_TOKEN:-}" ]]; then
  hf auth login --token "${HF_TOKEN}" || true
fi
if [[ -n "${WANDB_API_KEY:-}" ]]; then
  wandb login --relogin "${WANDB_API_KEY}" || true
fi

# Pre-warm evalplus (used by SubprocessVerifier inside compute_reward).
python -c "from evalplus.data import get_human_eval_plus, get_mbpp_plus; get_human_eval_plus(); get_mbpp_plus()"
65+
# Sanity-check that the GRPO dataset is in the tarball.
DATA_PATH="results/grpo_dataset/v1/train.parquet"
[[ -f "${DATA_PATH}" ]] || {
  echo "ERROR: GRPO dataset not found at ${DATA_PATH}"
  echo "Did you run scripts/build_grpo_dataset.py before transfer.sh?"
  exit 1
}
echo "==> dataset: ${DATA_PATH}"

# Always produce a placeholder tarball so HTCondor's transfer never holds
# the job — overwritten on training success.
placeholder_tarball="${INITIAL_PWD}/checkpoint.tar.gz"
tar -czf "${placeholder_tarball}" --files-from /dev/null
78+
# --- Optional: merge SFT adapter into base for warm-start ---
# MODEL_OVERRIDE stays empty unless an adapter is merged; when set it holds
# the model-path override that is handed to verl.
MODEL_OVERRIDE=""
if [[ -z "${ADAPTER_NAME}" ]]; then
  echo "==> using base model (no SFT warm-start)"
else
  ADAPTER_DIR="results/sft_checkpoints/${ADAPTER_NAME}"
  if [[ ! -d "${ADAPTER_DIR}" ]]; then
    echo "ERROR: adapter not found at ${ADAPTER_DIR}"
    exit 1
  fi
  # The merged model is written to job scratch, not into the repo checkout.
  MERGED_DIR="${_CONDOR_SCRATCH_DIR}/merged_${ADAPTER_NAME}"
  echo "==> merging adapter -> ${MERGED_DIR}"
  python scripts/merge_lora.py --adapter-dir "${ADAPTER_DIR}" --out "${MERGED_DIR}"
  MODEL_OVERRIDE="actor_rollout_ref.model.path=${MERGED_DIR}"
  echo "==> using merged SFT as GRPO starting policy"
fi
95+
# --- Launch verl ---
echo "==> python -m verl.trainer.main_ppo --config-path=configs --config-name=${CONFIG_NAME}"

# Don't bail on training failure — we still want the placeholder tarball
# transferred so the job doesn't end up held. errexit is switched off only
# around the trainer so its exit status can be captured in TRAIN_EXIT.
set +e
# ${VAR:+"..."} passes the override as exactly one argument when it is set
# and as no argument at all when it is empty, without exposing the path to
# word-splitting or globbing.
python -m verl.trainer.main_ppo \
  --config-path=configs \
  --config-name="${CONFIG_NAME}" \
  ${MODEL_OVERRIDE:+"${MODEL_OVERRIDE}"}
TRAIN_EXIT=$?
set -e

if [[ "${TRAIN_EXIT}" -ne 0 ]]; then
  echo "ERROR: verl exited ${TRAIN_EXIT}. Placeholder tarball will be transferred."
fi
112+
# --- Tar checkpoint for transfer back ---
# verl writes to trainer.default_local_dir from the config. Parse it.

# Print trainer.default_local_dir from the YAML config file at $1.
# The path is passed to Python as an argument rather than spliced into the
# Python source, so unusual characters in the config name cannot alter the
# program. Best-effort: prints an empty line if the file cannot be read or
# the key is missing.
# NOTE(review): the YAML is read verbatim, so a value that uses config
# interpolation would come back unresolved — confirm the config stores a
# literal path.
read_default_local_dir() {
  python -c '
import sys
import yaml
with open(sys.argv[1]) as f:
    cfg = yaml.safe_load(f)
print(cfg["trainer"]["default_local_dir"])
' "$1" 2>/dev/null || echo ""
}

# Pack the checkpoint directory $1 into the gzip tarball $2. The archive
# stores the directory under its basename only, not its full path.
archive_checkpoint_dir() {
  local ckpt_dir="$1"
  local dest="$2"
  tar -czf "${dest}" \
    -C "$(dirname "${ckpt_dir}")" \
    "$(basename "${ckpt_dir}")"
}

DEFAULT_LOCAL_DIR="$(read_default_local_dir "configs/${CONFIG_NAME}.yaml")"

if [[ -z "${DEFAULT_LOCAL_DIR}" ]]; then
  # Say why nothing was archived instead of printing an empty path.
  echo "WARNING: could not read trainer.default_local_dir from configs/${CONFIG_NAME}.yaml — placeholder tarball stays."
elif [[ -d "${DEFAULT_LOCAL_DIR}" ]]; then
  echo "==> tarring checkpoint dir: ${DEFAULT_LOCAL_DIR}"
  archive_checkpoint_dir "${DEFAULT_LOCAL_DIR}" "${INITIAL_PWD}/checkpoint.tar.gz"
  ls -lh "${INITIAL_PWD}/checkpoint.tar.gz"
else
  echo "WARNING: ${DEFAULT_LOCAL_DIR} not found — placeholder tarball stays."
fi
131+
132+
# Finish with the trainer's exit status (captured in TRAIN_EXIT) so the job's
# exit code reflects whether training succeeded.
exit "${TRAIN_EXIT}"

chtc/train_grpo.sub

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# HTCondor submit file for Week 4 GRPO training (verl).
2+
#
3+
# Variables passed by submit_train_grpo.sh via condor_submit:
#   results_dir   per-run directory (results/<run-name>); used as initial_dir
#   log_dir       per-run log directory (logs/<run-name>)
#   config_name   e.g. "grpo_qwen1_5b" (matches configs/<name>.yaml)
#   n_gpus        1 for 1.5B; 4 for 7B
#   adapter_name  optional SFT adapter to merge in as starting policy.
#                 Empty = train from base model in the config.
8+
9+
universe = container
10+
container_image = docker://verlai/verl:vllm012.latest
11+
12+
executable = train_grpo.sh
13+
initial_dir = $(results_dir)
14+
# adapter_name is LAST because it may be empty — HTCondor collapses empty
15+
# middle positionals, see Week 2 lessons.
16+
arguments = $(config_name) $(adapter_name)
17+
18+
log = ../../$(log_dir)/job.log
19+
output = ../../$(log_dir)/job.out
20+
error = ../../$(log_dir)/job.err
21+
22+
stream_output = true
23+
should_transfer_files = YES
24+
when_to_transfer_output = ON_EXIT
25+
transfer_input_files = .env, train_grpo.sh
26+
transfer_output_files = checkpoint.tar.gz
27+
28+
# Compute — GRPO holds rollout + training in memory simultaneously.
29+
request_cpus = 8
30+
request_memory = 96GB
31+
request_disk = 300GB
32+
33+
# GPU — 1 for 1.5B, 4 for 7B (passed via $(n_gpus)).
34+
request_gpus = $(n_gpus)
35+
require_gpus = (GlobalMemoryMb >= 40000) && (Capability >= 8.0)
36+
requirements = (Target.HasCHTCStaging == true) && (Target.CUDADriverVersion >= 12.1)
37+
38+
+WantGPULab = true
39+
+WantFlocking = false
40+
+WantGlidein = false
41+
# "medium" allows up to 24h — full GRPO runs of 500-1000 steps may need it.
42+
+GPUJobLength = "medium"
43+
44+
queue 1

0 commit comments

Comments
 (0)