Skip to content

Commit 59c63bb

Browse files
Minor updates to used config
1 parent a622494 commit 59c63bb

1 file changed

Lines changed: 14 additions & 3 deletions

File tree

apertus/launch/multinode_async_sandbox/launch_incogitans.sh

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,18 +36,19 @@ if [[ -z "${TOKENIZER_NAME_OR_PATH:-}" ]]; then
3636
fi
3737
CONFIG_NAME="${CONFIG_NAME:-async}"
3838
SLURM_TIME=12:00:00
39-
TRAIN_NNODES=24
40-
ROLLOUT_NNODES=8
39+
TRAIN_NNODES=16
40+
ROLLOUT_NNODES=16
4141
NNODES="${NNODES:-$((TRAIN_NNODES + ROLLOUT_NNODES))}"
4242
TRAINING_DATA_DIR=/capstor/store/cscs/swissai/infra01/reasoning/data/RL-prod/apertus_1p5_incogitans
4343
FORCE_THINKING=false
4444
THINK_PREFIX_TOKEN="<|inner_prefix|>"
4545
ENABLE_THINKING=false
4646
SEED=85
47-
ROLLOUT_N=12
47+
ROLLOUT_N=8
4848
N_PER_ROUND="${N_PER_ROUND:-${ROLLOUT_N}}"
4949
USE_GROUP_FILTERING=true
5050
JOB_NAME=1p5_8b_incogitans
51+
RESUME_RUN_NAME=""
5152
VAL_BEFORE_TRAIN=true
5253

5354
WANDB_ENTITY="${WANDB_ENTITY:-apertus}"
@@ -124,12 +125,18 @@ resolve_run_name_and_dir() {
124125
JOB_NAME="async__${CONFIG_NAME}_${group_filtering_tag}${model_tag}_${TRAIN_NNODES}tn-${ROLLOUT_NNODES}rn__s${SEED}${thinking_tag}"
125126
fi
126127
JOB_NAME="$(sanitize_job_name "${JOB_NAME}")"
128+
if [[ -n "${RESUME_RUN_NAME}" ]]; then
129+
RUN_NAME="${RESUME_RUN_NAME}"
130+
else
127131
RUN_NAME="${JOB_NAME}__$(date +%Y%m%d-%H%M%S)"
132+
fi
128133
RUN_DIR="${WORKING_DIR}/outputs/${PROJECT_NAME}/${RUN_NAME}"
129134
SCHED_JOB_NAME="${JOB_NAME}_sched"
130135
TRAIN_JOB_NAME="${JOB_NAME}_train"
131136

137+
if [[ -z "${RESUME_RUN_NAME}" ]]; then
132138
mkdir -p "${RUN_DIR}"
139+
fi
133140
}
134141

135142
probe_ok() {
@@ -300,6 +307,9 @@ if [[ "${WANDB_BACKGROUND_SYNC}" == "true" ]]; then
300307
else
301308
log " -> output=${RUN_DIR}"
302309
fi
310+
if [[ -n "${RESUME_RUN_NAME}" ]]; then
311+
log " -> resume_run_name=${RESUME_RUN_NAME}"
312+
fi
303313
log " -> qa_gym_reranker_url=${QA_GYM_RERANKER_URL}"
304314
if [[ -n "${URL}" ]]; then
305315
log " -> sandbox_backend=${SANDBOX_BACKEND} sandbox_url=${URL} continuous=${SANDBOX_REWARD_CONTINUOUS}"
@@ -351,6 +361,7 @@ EXPORT_VARS=(
351361
"PROJECT_NAME=${PROJECT_NAME}"
352362
"RUN_NAME=${RUN_NAME}"
353363
"RUN_DIR=${RUN_DIR}"
364+
"RESUME_RUN_NAME=${RESUME_RUN_NAME}"
354365
"WANDB_BACKGROUND_SYNC=${WANDB_BACKGROUND_SYNC}"
355366
"WANDB_ENTITY=${WANDB_ENTITY}"
356367
"WANDB_MODE=${WANDB_MODE}"

0 commit comments

Comments (0)