[algo] support SAPO (#572)

Kuangdd01 · gemini-code-assist[bot] · web-flow · commit 55b40c8c21c4 · 2025-12-29T23:27:36.000+08:00
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
diff --git a/assets/baselines.md b/assets/baselines.md
@@ -23,6 +23,7 @@ Welcome to contribute new data points!
 | 7B   | DAPO        | AMP  | 1e-6 | 1e-2 | 0.37 -> 0.50 (+0.13) |
 | 7B   | GSPO        | AMP  | 1e-6 |    0 | 0.37 -> 0.48 (+0.11) |
 | 7B   | CISPO       | AMP  | 1e-6 | 1e-2 | 0.37 -> 0.50 (+0.13) |
+| 7B   | SAPO        | AMP  | 1e-6 |    0 | 0.37 -> 0.54 (+0.17) |
 | 3B   | GRPO        | AMP  | 1e-6 | 1e-2 | 0.24 -> 0.38 (+0.14) |
 | 32B  | GRPO        | BF16 | 1e-6 | 1e-2 | 0.50 -> 0.56 (+0.06) |
 
diff --git a/examples/qwen2_5_vl_7b_geo3k_sapo.sh b/examples/qwen2_5_vl_7b_geo3k_sapo.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -x
+
+MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct  # replace it with your local file path
+
+python3 -m verl.trainer.main \
+    config=examples/config.yaml \
+    data.train_files=hiyouga/geometry3k@train \
+    data.val_files=hiyouga/geometry3k@test \
+    worker.actor.model.model_path=${MODEL_PATH} \
+    worker.actor.loss_type=sapo \
+    algorithm.disable_kl=True \
+    trainer.experiment_name=qwen2_5_vl_7b_geo_sapo \
+    trainer.n_gpus_per_node=8
+
diff --git a/verl/trainer/core_algos.py b/verl/trainer/core_algos.py
@@ -415,7 +415,9 @@ def compute_policy_loss(
     clip_ratio_low: float,
     clip_ratio_high: float,
     clip_ratio_dual: float,
-    loss_type: Literal["default", "gspo", "gspo_token", "cispo"],
+    tau_positive: float,
+    tau_negative: float,
+    loss_type: Literal["default", "gspo", "gspo_token", "cispo", "sapo"],
     loss_avg_mode: Literal["token", "seq"],
     **kwargs,
 ) -> tuple[torch.Tensor, dict[str, float]]:
@@ -438,6 +440,10 @@ def compute_policy_loss(
             The higher clip range used in DAPO. See https://arxiv.org/pdf/2503.14476
         clip_ratio_dual: (float)
             The dual clip range used in Dual-clip PPO. See https://arxiv.org/pdf/1912.09729
+        tau_positive: (float)
+            The temperature that controls the clipping of positive tokens in SAPO. See https://arxiv.org/pdf/2511.20347
+        tau_negative: (float)
+            The temperature that controls the clipping of negative tokens in SAPO. See https://arxiv.org/pdf/2511.20347
         loss_avg_mode: (Literal["token", "seq"])
             "token": average the loss in the whole batch
             "seq": average the loss in each sequence then average the mean of the means
@@ -481,6 +487,12 @@ def compute_policy_loss(
 
     if loss_type == "cispo":
         final_pg_loss = -advantages * log_probs * clipped_ratio.detach()
+    elif loss_type == "sapo":
+        positive_token_mask =  (advantages >= 0).float()
+        negative_token_mask =  (advantages < 0).float()
+        gate_negative = 4.0 / tau_negative * torch.sigmoid(tau_negative * (ratio - 1.0))
+        gate_positive = 4.0 / tau_positive * torch.sigmoid(tau_positive * (ratio - 1.0))
+        final_pg_loss = -advantages * (positive_token_mask * gate_positive + negative_token_mask * gate_negative)
     else:
         pg_loss = -advantages * ratio  # -ratio * A
         pg_loss2 = -advantages * clipped_ratio  # -clip(ratio, 1-clip_low, 1+clip_high) * A
diff --git a/verl/workers/actor/config.py b/verl/workers/actor/config.py
@@ -104,6 +104,10 @@ class ActorConfig:
     """ulysses sequence parallel size"""
     use_torch_compile: bool = True
     """enable torch compile"""
+    tau_positive: float = 1.0
+    """temperature for positive tokens"""
+    tau_negative: float = 1.05
+    """temperature for negative tokens"""
     model: ModelConfig = field(default_factory=ModelConfig)
     optim: OptimConfig = field(default_factory=OptimConfig)
     fsdp: FSDPConfig = field(default_factory=FSDPConfig)
diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py
@@ -264,6 +264,8 @@ def update_policy(self, data: DataProto) -> dict[str, Any]:
                         clip_ratio_low=self.config.clip_ratio_low,
                         clip_ratio_high=self.config.clip_ratio_high,
                         clip_ratio_dual=self.config.clip_ratio_dual,
+                        tau_positive=self.config.tau_positive,
+                        tau_negative=self.config.tau_negative,
                         loss_type=self.config.loss_type,
                         loss_avg_mode=self.config.loss_avg_mode,
                     )

Original file line number	Diff line number	Diff line change
`@@ -264,6 +264,8 @@ def update_policy(self, data: DataProto) -> dict[str, Any]:`
`264`	`264`	`clip_ratio_low=self.config.clip_ratio_low,`
`265`	`265`	`clip_ratio_high=self.config.clip_ratio_high,`
`266`	`266`	`clip_ratio_dual=self.config.clip_ratio_dual,`
	`267`	`+ tau_positive=self.config.tau_positive,`
	`268`	`+ tau_negative=self.config.tau_negative,`
`267`	`269`	`loss_type=self.config.loss_type,`
`268`	`270`	`loss_avg_mode=self.config.loss_avg_mode,`
`269`	`271`	`)`