Support variable-length (varlen) sequences with context parallelism (CP) and data packing for normal models

Williamren97 · PopSoda2002 · commit f32b57f43d65 · 2025-11-01T22:21:01.000Z
diff --git a/slime/backends/fsdp_utils/actor.py b/slime/backends/fsdp_utils/actor.py
@@ -260,6 +260,9 @@ def compute_log_prob(
                 for batch in self.prof.iterate_train_log_probs(
                     tqdm(packed_batches, desc=f"{store_prefix}log_probs", disable=dist.get_rank() != 0)
                 ):
+                    # Update cu_seqlens for CP before forward pass
+                    if self.args.enable_cp:
+                        self._update_cp_cu_seqlens(batch)
                     with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                         model_args = {
                             "input_ids": batch["tokens"].unsqueeze(0),
diff --git a/tests/test_qwen3-0.6B_fsdp_colocated_2xGPU.sh b/tests/test_qwen3-0.6B_fsdp_colocated_2xGPU.sh
@@ -96,6 +96,13 @@ FSDP_ARGS=(
    --update-weights-buffer-size $((512 * 1024 * 1024)) # 512MB
 )
 
+# Context Parallelism Arguments
+# Uncomment the flags below to enable CP with varlen support and data packing
+CP_ARGS=(
+   # --enable-cp                        # Enable Context Parallelism
+   # --ring-flash-atten-type llama3    # Use llama3 ring attention implementation
+)
+
 # launch the master node of ray in container
 ray start --head --node-ip-address 127.0.0.1 --num-gpus 2 --disable-usage-stats
 
@@ -115,4 +122,6 @@ ray job submit --address="http://127.0.0.1:8265" \
    ${OPTIMIZER_ARGS[@]} \
    ${GRPO_ARGS[@]} \
    ${SGLANG_ARGS[@]} \
-   ${WANDB_ARGS[@]} 
+   ${WANDB_ARGS[@]} \
+   ${FSDP_ARGS[@]} \
+   ${CP_ARGS[@]}