fix: enable param/optimizer offload + lower vllm mem util; OOM was killing workers

Devesh-Maheshwari · Devesh-Maheshwari · commit 7b83d83761b7 · 2026-04-25T09:02:27.000-05:00
diff --git a/configs/grpo_qwen1_5b.yaml b/configs/grpo_qwen1_5b.yaml
@@ -72,8 +72,10 @@ actor_rollout_ref:
     fsdp_config:
       wrap_policy:
         min_num_params: 0
-      param_offload: false
-      optimizer_offload: false
+      # Offload to CPU — frees ~15 GB on A100 80GB. Slower per step but
+      # avoids OOM with rollout + training + ref policy all on one card.
+      param_offload: true
+      optimizer_offload: true
       fsdp_size: -1
 
   rollout:
@@ -84,8 +86,10 @@ actor_rollout_ref:
     prompt_length: 1024
     response_length: 1024
     dtype: bfloat16
-    # Leave 40% headroom for training activations.
-    gpu_memory_utilization: 0.6
+    # 0.4 (was 0.6) gives back ~16 GB (20% of an 80 GB A100) to training. Raise
+    # it again if rollout batches start hitting OOM themselves; 0.4 is a safe
+    # starting point for 1.5B + LoRA-merged + Adam state on a single A100.
+    gpu_memory_utilization: 0.4
     ignore_eos: false
     enforce_eager: false
     free_cache_engine: true
@@ -103,7 +107,9 @@ actor_rollout_ref:
 
   ref:
     fsdp_config:
-      param_offload: false
+      # Frozen reference policy: keep its weights on CPU and move them to the
+      # GPU only for the per-step KL computation. Saves ~3 GB.
+      param_offload: true
     log_prob_micro_batch_size_per_gpu: 1
     log_prob_use_dynamic_bsz: false
     log_prob_max_token_len_per_gpu: 16384
diff --git a/configs/grpo_qwen1_5b_smoke.yaml b/configs/grpo_qwen1_5b_smoke.yaml
@@ -12,3 +12,6 @@ trainer:
   total_training_steps: 50      # ~30-60 min instead of multi-hour
   save_freq: 25                 # one mid-run + one final checkpoint
   experiment_name: grpo-qwen-1.5b-smoke
+  # Set explicitly so the post-train tar step in train_grpo.sh can read it with
+  # a standalone yaml.safe_load (which doesn't resolve Hydra inheritance).
+  default_local_dir: results/grpo_checkpoints/qwen-1.5b-smoke