File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -72,8 +72,10 @@ actor_rollout_ref:
7272 fsdp_config :
7373 wrap_policy :
7474 min_num_params : 0
75- param_offload : false
76- optimizer_offload : false
75+ # Offload FSDP params and optimizer state to CPU — frees ~15 GB on an
76+ # A100 80GB. Slower per step, but avoids OOM with rollout + training +
76+ # ref policy all on one card.
77+ param_offload : true
78+ optimizer_offload : true
7779 fsdp_size : -1
7880
7981 rollout :
@@ -84,8 +86,10 @@ actor_rollout_ref:
8486 prompt_length : 1024
8587 response_length : 1024
8688 dtype : bfloat16
87- # Leave 40% headroom for training activations.
88- gpu_memory_utilization : 0.6
89+ # 0.4 (was 0.6) gives ~16 GB (20% of an 80 GB card) back to training.
90+ # Raise it again if rollout batches start hitting OOM themselves; 0.4 is
91+ # a safe starting point for 1.5B + LoRA-merged + Adam state on a single A100.
92+ gpu_memory_utilization : 0.4
8993 ignore_eos : false
9094 enforce_eager : false
9195 free_cache_engine : true
@@ -103,7 +107,9 @@ actor_rollout_ref:
103107
104108 ref :
105109 fsdp_config :
106- param_offload : false
110+ # The reference policy is frozen, so offload its weights to CPU; they are
111+ # only needed for the KL computation each step. Saves ~3 GB.
112+ param_offload : true
107113 log_prob_micro_batch_size_per_gpu : 1
108114 log_prob_use_dynamic_bsz : false
109115 log_prob_max_token_len_per_gpu : 16384
Original file line number Diff line number Diff line change @@ -12,3 +12,6 @@ trainer:
1212 total_training_steps : 50 # ~30-60 min instead of multi-hour
1313 save_freq : 25 # one mid-run + one final checkpoint
1414 experiment_name : grpo-qwen-1.5b-smoke
15+ # Explicit so the post-train tar step in train_grpo.sh can find it via
16+ # standalone yaml.safe_load (which doesn't resolve Hydra inheritance).
17+ default_local_dir : results/grpo_checkpoints/qwen-1.5b-smoke
You can’t perform that action at this time.
0 commit comments