Skip to content

Commit 88afbd4

Browse files
committed
Support varlen with CP and data packing for normal models
1 parent c7cb165 commit 88afbd4

File tree

2 files changed

+14
-1
lines changed

2 files changed

+14
-1
lines changed

slime/backends/fsdp_utils/actor.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,10 @@ def compute_log_prob(
160160
rollout_data = {f"{store_prefix}log_probs": []}
161161
with timer(f"{store_prefix}log_probs") and torch.no_grad():
162162
for batch in packed_batches:
163+
# Update cu_seqlens for CP before forward pass
164+
if self.args.enable_cp:
165+
self._update_cp_cu_seqlens(batch)
166+
163167
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
164168
model_args = {
165169
"input_ids": batch["tokens"].unsqueeze(0),

tests/test_qwen3-0.6B_fsdp_colocated_2xGPU.sh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,13 @@ FSDP_ARGS=(
9393
--update-weights-buffer-size $((512 * 1024 * 1024)) # 512MB
9494
)
9595

96+
# Context Parallelism Arguments
97+
# Uncomment to enable CP with varlen support and data packing
98+
CP_ARGS=(
99+
# --enable-cp # Enable Context Parallelism
100+
# --ring-flash-atten-type llama3 # Use llama3 ring attention implementation
101+
)
102+
96103
# launch the master node of ray in container
97104
ray start --head --node-ip-address 127.0.0.1 --num-gpus 2 --disable-usage-stats
98105

@@ -112,4 +119,6 @@ ray job submit --address="http://127.0.0.1:8265" \
112119
${OPTIMIZER_ARGS[@]} \
113120
${GRPO_ARGS[@]} \
114121
${SGLANG_ARGS[@]} \
115-
${WANDB_ARGS[@]}
122+
${WANDB_ARGS[@]} \
123+
${FSDP_ARGS[@]} \
124+
${CP_ARGS[@]}

0 commit comments

Comments
 (0)