|
#!/usr/bin/env bash
# SLURM launcher for synchronous VCPO (fully-async-policy recipe) GRPO training
# on GSM8K with Qwen2-1.5B. One exclusive 8-GPU node is split between vLLM
# rollout and Megatron training (see "GPU Layout" below).
#
# Submit with: sbatch <this_script>
# NOTE(review): sbatch does not create directories for %A_%x.out/.err — the
# ./slurm directory must exist before submission or job output will fail.
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=128
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=./slurm/%A_%x.out
#SBATCH --error=./slurm/%A_%x.err
#SBATCH --job-name=vcpo
# Fail fast and loudly: trace every command (-x), abort on command failure
# (-e), on use of unset variables (-u), and on failures anywhere in a
# pipeline (pipefail).
set -x
set -euo pipefail

# Runtime knobs for CUDA, Ray and vLLM (semantics owned by those tools).
CUDA_DEVICE_MAX_CONNECTIONS=1
RAY_DISABLE_IMPORT_WARNING=1
VLLM_USE_V1=1
RAY_ADDRESS=local
export CUDA_DEVICE_MAX_CONNECTIONS RAY_DISABLE_IMPORT_WARNING VLLM_USE_V1 RAY_ADDRESS
# ================= Paths =================
# All three are overridable from the environment.
MODEL_PATH=${MODEL_PATH:-"models/Qwen2-1.5B"}
TRAIN_FILE=${TRAIN_FILE:-"data/gsm8k/train.parquet"}
TEST_FILE=${TEST_FILE:-"data/gsm8k/test.parquet"}

project_name='vcpo'

# ================= GPU Layout =================
# The node's GPUs are statically partitioned: n_gpus_rollout go to vLLM
# generation, the remainder to Megatron training.
NNODES=${NNODES:-1}
NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
n_gpus_rollout=6
n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))

# Fail early with a clear message instead of handing a zero/negative GPU
# count to the trainer (happens whenever NGPUS_PER_NODE is overridden to a
# value <= n_gpus_rollout).
if (( n_gpus_training < 1 )); then
  echo "ERROR: n_gpus_training=${n_gpus_training}; NGPUS_PER_NODE (${NGPUS_PER_NODE}) must exceed n_gpus_rollout (${n_gpus_rollout})" >&2
  exit 1
fi
# ================= Rollout =================
rollout_mode="async"            # async server-style rollout (not the sync hybrid engine)
rollout_name="vllm"             # inference backend
return_raw_chat="True"
gen_tp=2                        # vLLM tensor-parallel size during generation
n_resp_per_prompt=8             # responses sampled per prompt (fed to rollout.n)
gpu_memory_utilization=0.9     # fraction of GPU memory vLLM may reserve
enable_chunked_prefill=False
calculate_log_probs=True        # ask the rollout engine to return token log-probs

# ================= Sequence Lengths =================
max_prompt_length=2048
max_response_length=2048
# Batched-token cap sized to exactly one full-length prompt + response.
max_num_batched_tokens=$((max_prompt_length + max_response_length))

# ================= Megatron Parallelism =================
train_tp=2                      # tensor-model parallel size
train_pp=1                      # pipeline-model parallel size
train_cp=1                      # context parallel size
sequence_parallel=True
use_remove_padding=True
precision_dtype="bfloat16"      # shared by actor, ref, critic and rollout below
# ================= Batch Sizes =================
# NOTE(review): train_batch_size=0 with gen_batch_size=1 is presumably
# intentional for the fully-async recipe (training batches accumulate from
# the rollout stream rather than fixed prompt batches) — confirm against
# fully_async_ppo_megatron_trainer.yaml semantics.
train_prompt_bsz=0
gen_prompt_bsz=1
train_prompt_mini_bsz=8         # prompts per PPO mini-batch
micro_bsz_per_gpu=1
use_dynamic_bsz=False
log_prob_micro_bsz_per_gpu=1

# ================= Algorithm =================
adv_estimator=grpo
loss_agg_mode="seq-mean-token-mean"
# Extreme clip thresholds (1e9) — looks like PPO clipping is effectively
# disabled on the high/dual side; confirm against the trainer's clip semantics.
clip_ratio_low=1.0
clip_ratio_high=1e9
clip_ratio_c=1e9
use_kl_loss=False               # no KL term in the actor loss ...
kl_loss_coef=0.0
use_kl_in_reward=False          # ... and none folded into the reward
kl_coef=0.0
entropy_coeff=0
grad_clip=1.0

# ================= Optimizer =================
lr=1e-6
lr_warmup_steps=0
weight_decay=0.1

# ================= IS / Rollout Correction =================
# Sequence-level importance sampling with ratio threshold 8.0; rejection
# sampling disabled (null is passed through to the Hydra config).
rollout_is="sequence"
rollout_is_threshold="8.0"
rollout_rs=null
rollout_rs_threshold=null

# ================= Synchronous Training =================
# Zero staleness + one update per parameter sync + one minibatch per update:
# the async pipeline is configured to behave synchronously.
staleness_threshold=0.0
updates_per_param_sync=1
num_minibatches_per_update=1
partial_rollout=False
use_rollout_log_probs=True

# Set to True to view per-trajectory gradient statistics
update_policy_per_traj=False

# ================= Training/Rollout Steps =================
# 500 parameter syncs' worth of prompts (mini-batch prompts per minibatch,
# minibatches per update, updates per sync).
total_rollout_steps=$((500 * num_minibatches_per_update * updates_per_param_sync * train_prompt_mini_bsz))
epochs=10000000                 # effectively unbounded; run length is governed by rollout steps
test_freq=10
save_freq=-1                    # -1: presumably disables periodic checkpoint saving — confirm
# ================= Logging =================
exp_name="Synchronous GSM8k Qwen2-1.5B ${n_gpus_rollout}-${n_gpus_training} ${loss_agg_mode} ${max_response_length}-len ${weight_decay}-wd"
# Strip '/' so the experiment name is safe as a directory component.
exp_name_safe=${exp_name//\//_}
log_dir="logs/${exp_name_safe}"
CKPTS_DIR="${log_dir}"          # checkpoints share the log directory

trainer_logger="['console','wandb']"
log_val_generations=0
wandb_entity=${wandb_entity:-""}
wandb_group=${wandb_group:-"vcpo-release"}
val_before_train=False

# ================= LR decay =================
# Constant LR schedule; decay_steps is still passed for completeness.
lr_decay_style="constant"
lr_decay_steps=${total_rollout_steps}
# ================= Run =================
# BUGFIX: run_log_file was referenced by the final `tee` but never defined,
# so under `set -u` the whole launch pipeline aborted with an "unbound
# variable" error. Define it (and create log_dir) before use.
mkdir -p "${log_dir}"
run_log_file="${log_dir}/run.log"

# Launch the fully-async PPO/GRPO trainer; all config values come from the
# variables defined above, passed as Hydra overrides. Output is both shown
# on the console and appended to the run log.
python -m recipe.fully_async_policy.fully_async_main \
    --config-name=fully_async_ppo_megatron_trainer.yaml \
    data.train_files="${TRAIN_FILE}" \
    data.val_files="${TEST_FILE}" \
    data.prompt_key=prompt \
    data.truncation='left' \
    data.max_prompt_length=${max_prompt_length} \
    data.max_response_length=${max_response_length} \
    data.train_batch_size=${train_prompt_bsz} \
    data.gen_batch_size=${gen_prompt_bsz} \
    data.return_raw_chat=${return_raw_chat} \
    data.filter_overlong_prompts=True \
    data.filter_overlong_prompts_workers=8 \
    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
    algorithm.adv_estimator=${adv_estimator} \
    algorithm.use_kl_in_reward=${use_kl_in_reward} \
    algorithm.kl_ctrl.kl_coef=${kl_coef} \
    algorithm.rollout_correction.rollout_is=${rollout_is} \
    algorithm.rollout_correction.rollout_is_threshold=${rollout_is_threshold} \
    algorithm.rollout_correction.rollout_rs=${rollout_rs} \
    algorithm.rollout_correction.rollout_rs_threshold=${rollout_rs_threshold} \
    actor_rollout_ref.actor.strategy=megatron \
    critic.strategy=megatron \
    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
    actor_rollout_ref.actor.clip_ratio_c=${clip_ratio_c} \
    actor_rollout_ref.model.path="${MODEL_PATH}" \
    actor_rollout_ref.model.use_remove_padding=${use_remove_padding} \
    actor_rollout_ref.hybrid_engine=False \
    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${micro_bsz_per_gpu} \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
    actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp} \
    actor_rollout_ref.actor.megatron.sequence_parallel=${sequence_parallel} \
    actor_rollout_ref.actor.megatron.dtype=${precision_dtype} \
    actor_rollout_ref.actor.megatron.use_remove_padding=${use_remove_padding} \
    actor_rollout_ref.actor.megatron.param_offload=False \
    actor_rollout_ref.actor.megatron.optimizer_offload=False \
    actor_rollout_ref.actor.megatron.grad_offload=False \
    actor_rollout_ref.actor.optim.lr=${lr} \
    actor_rollout_ref.actor.optim.lr_warmup_steps=${lr_warmup_steps} \
    actor_rollout_ref.actor.optim.lr_decay_style=${lr_decay_style} \
    actor_rollout_ref.actor.optim.lr_decay_steps=${lr_decay_steps} \
    actor_rollout_ref.actor.optim.weight_decay=${weight_decay} \
    actor_rollout_ref.actor.optim.clip_grad=${grad_clip} \
    actor_rollout_ref.actor.entropy_coeff=${entropy_coeff} \
    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
    actor_rollout_ref.actor.use_rollout_log_probs=${use_rollout_log_probs} \
    actor_rollout_ref.actor.update_policy_per_traj=${update_policy_per_traj} \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
    actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp} \
    actor_rollout_ref.ref.megatron.sequence_parallel=${sequence_parallel} \
    actor_rollout_ref.ref.megatron.dtype=${precision_dtype} \
    actor_rollout_ref.ref.megatron.use_remove_padding=${use_remove_padding} \
    actor_rollout_ref.ref.megatron.param_offload=True \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${log_prob_micro_bsz_per_gpu} \
    actor_rollout_ref.rollout.name=${rollout_name} \
    actor_rollout_ref.rollout.mode=${rollout_mode} \
    actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} \
    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
    actor_rollout_ref.rollout.dtype=${precision_dtype} \
    actor_rollout_ref.rollout.enable_chunked_prefill=${enable_chunked_prefill} \
    actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens} \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.top_p=1.0 \
    actor_rollout_ref.rollout.top_k=-1 \
    actor_rollout_ref.rollout.val_kwargs.temperature=0.8 \
    actor_rollout_ref.rollout.val_kwargs.top_p=0.7 \
    actor_rollout_ref.rollout.val_kwargs.top_k=-1 \
    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
    actor_rollout_ref.rollout.val_kwargs.n=3 \
    actor_rollout_ref.rollout.calculate_log_probs=${calculate_log_probs} \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${log_prob_micro_bsz_per_gpu} \
    critic.megatron.tensor_model_parallel_size=${train_tp} \
    critic.megatron.pipeline_model_parallel_size=${train_pp} \
    critic.megatron.context_parallel_size=${train_cp} \
    critic.megatron.sequence_parallel=${sequence_parallel} \
    critic.megatron.dtype=${precision_dtype} \
    trainer.logger=${trainer_logger} \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    +trainer.wandb_entity="${wandb_entity}" \
    +trainer.wandb_group="${wandb_group}" \
    trainer.val_before_train=${val_before_train} \
    trainer.save_freq=${save_freq} \
    trainer.rollout_data_dir="${log_dir}" \
    trainer.log_val_generations=${log_val_generations} \
    trainer.default_local_dir="${CKPTS_DIR}" \
    trainer.nnodes="${NNODES}" \
    trainer.n_gpus_per_node="${n_gpus_training}" \
    rollout.nnodes="${NNODES}" \
    rollout.n_gpus_per_node="${n_gpus_rollout}" \
    rollout.total_rollout_steps="${total_rollout_steps}" \
    rollout.total_epochs="${epochs}" \
    rollout.test_freq="${test_freq}" \
    async_training.staleness_threshold="${staleness_threshold}" \
    async_training.trigger_parameter_sync_step="${updates_per_param_sync}" \
    async_training.require_batches="${num_minibatches_per_update}" \
    async_training.partial_rollout="${partial_rollout}" \
    async_training.compute_prox_log_prob=True \
    async_training.use_rollout_log_probs="${use_rollout_log_probs}" \
    2>&1 | tee -a "${run_log_file}"
0 commit comments