@@ -36,18 +36,19 @@ if [[ -z "${TOKENIZER_NAME_OR_PATH:-}" ]]; then
3636fi
3737CONFIG_NAME=" ${CONFIG_NAME:- async} "
3838SLURM_TIME=12:00:00
39- TRAIN_NNODES=24
40- ROLLOUT_NNODES=8
39+ TRAIN_NNODES=16
40+ ROLLOUT_NNODES=16
4141NNODES=" ${NNODES:- $((TRAIN_NNODES + ROLLOUT_NNODES))} "
4242TRAINING_DATA_DIR=/capstor/store/cscs/swissai/infra01/reasoning/data/RL-prod/apertus_1p5_incogitans
4343FORCE_THINKING=false
4444THINK_PREFIX_TOKEN=" <|inner_prefix|>"
4545ENABLE_THINKING=false
4646SEED=85
47- ROLLOUT_N=12
47+ ROLLOUT_N=8
4848N_PER_ROUND=" ${N_PER_ROUND:- ${ROLLOUT_N} } "
4949USE_GROUP_FILTERING=true
5050JOB_NAME=1p5_8b_incogitans
51+ RESUME_RUN_NAME=" "
5152VAL_BEFORE_TRAIN=true
5253
5354WANDB_ENTITY=" ${WANDB_ENTITY:- apertus} "
@@ -124,12 +125,18 @@ resolve_run_name_and_dir() {
124125 JOB_NAME=" async__${CONFIG_NAME} _${group_filtering_tag}${model_tag} _${TRAIN_NNODES} tn-${ROLLOUT_NNODES} rn__s${SEED}${thinking_tag} "
125126 fi
126127 JOB_NAME=" $( sanitize_job_name " ${JOB_NAME} " ) "
128+ if [[ -n " ${RESUME_RUN_NAME} " ]]; then
129+ RUN_NAME=" ${RESUME_RUN_NAME} "
130+ else
127131 RUN_NAME=" ${JOB_NAME} __$( date +%Y%m%d-%H%M%S) "
132+ fi
128133 RUN_DIR=" ${WORKING_DIR} /outputs/${PROJECT_NAME} /${RUN_NAME} "
129134 SCHED_JOB_NAME=" ${JOB_NAME} _sched"
130135 TRAIN_JOB_NAME=" ${JOB_NAME} _train"
131136
137+ if [[ -z " ${RESUME_RUN_NAME} " ]]; then
132138 mkdir -p " ${RUN_DIR} "
139+ fi
133140}
134141
135142probe_ok () {
@@ -300,6 +307,9 @@ if [[ "${WANDB_BACKGROUND_SYNC}" == "true" ]]; then
300307else
301308 log " -> output=${RUN_DIR} "
302309fi
310+ if [[ -n " ${RESUME_RUN_NAME} " ]]; then
311+ log " -> resume_run_name=${RESUME_RUN_NAME} "
312+ fi
303313log " -> qa_gym_reranker_url=${QA_GYM_RERANKER_URL} "
304314if [[ -n " ${URL} " ]]; then
305315 log " -> sandbox_backend=${SANDBOX_BACKEND} sandbox_url=${URL} continuous=${SANDBOX_REWARD_CONTINUOUS} "
@@ -351,6 +361,7 @@ EXPORT_VARS=(
351361 " PROJECT_NAME=${PROJECT_NAME} "
352362 " RUN_NAME=${RUN_NAME} "
353363 " RUN_DIR=${RUN_DIR} "
364+ " RESUME_RUN_NAME=${RESUME_RUN_NAME} "
354365 " WANDB_BACKGROUND_SYNC=${WANDB_BACKGROUND_SYNC} "
355366 " WANDB_ENTITY=${WANDB_ENTITY} "
356367 " WANDB_MODE=${WANDB_MODE} "
0 commit comments