feat: Add CISPO (Clipped IS-weight Policy Optimization)

kekmodel · kekmodel · commit 0edbebf34c65 · 2025-11-07T00:45:32.000+09:00
Add support for the CISPO algorithm from the MiniMax-M1 paper, which addresses
PPO/GRPO's limitation of clipping out low-probability reasoning tokens.

Changes:
- Add compute_cispo_loss() in slime/utils/ppo_utils.py
- Add 'cispo' to advantage_estimator choices
- Update reward normalization to include CISPO
- Use CISPO loss when advantage_estimator='cispo'

Key implementation details:
- Token-level IS with stop-gradient on clipped ratios
- Explicit log probability: ratio_sg * advantages * log_probs
- Upper-only clipping with default eps_clip_high=5.0
- Direct clipfrac calculation: (ratio > eps_clip_high)

Reference: MiniMax-M1 paper (arxiv:2506.13585)
diff --git a/slime/backends/fsdp_utils/actor.py b/slime/backends/fsdp_utils/actor.py
@@ -484,7 +484,7 @@ def _train_step(self, packed_batch, world_size, reported_accum, mbs_id, grad_acc
             temperature=self.args.rollout_temperature,
         )
         packed_batch["cur_log_probs"] = log_probs
-        
+
         shifted_logits = logits.squeeze(0)[:-1]
         log_probs_full = torch.log_softmax(shifted_logits, dim=-1)
         probs = torch.softmax(shifted_logits, dim=-1)
@@ -554,7 +554,7 @@ def _train_step(self, packed_batch, world_size, reported_accum, mbs_id, grad_acc
 
         entropy = torch.cat([batch["entropy"] for batch in unpacked_batches], dim=0)
         entropy_loss = sum_of_sample_mean(entropy, response_lengths, loss_masks)
-        
+
         loss = pg_loss - self.args.entropy_coef * entropy_loss
 
         if self.args.use_kl_loss:
diff --git a/slime/backends/megatron_utils/loss.py b/slime/backends/megatron_utils/loss.py
@@ -10,6 +10,7 @@
 from slime.utils.ppo_utils import (
     calculate_log_probs_and_entropy,
     compute_approx_kl,
+    compute_cispo_loss,
     compute_policy_loss,
     get_advantages_and_returns,
     get_grpo_returns,
@@ -236,7 +237,7 @@ def compute_advantages_and_returns(args: Namespace, rollout_data: RolloutBatch)
             for i in range(len(log_probs))
         ]
 
-    if args.advantage_estimator in ["grpo", "gspo"]:
+    if args.advantage_estimator in ["grpo", "gspo", "cispo"]:
         rewards = torch.tensor(rewards, dtype=torch.float32, device=kl[0].device)
         returns = get_grpo_returns(rewards, kl)
         # TODO: is the copy necessary?
@@ -416,7 +417,11 @@ def policy_loss_function(
         log_probs = torch.cat(log_probs, dim=0)
         ppo_kl = old_log_probs - log_probs
 
-    pg_loss, pg_clipfrac = compute_policy_loss(ppo_kl, advantages, args.eps_clip, args.eps_clip_high)
+    # Compute policy loss: CISPO uses upper truncation with stop-gradient
+    if args.advantage_estimator == "cispo":
+        pg_loss, pg_clipfrac = compute_cispo_loss(ppo_kl, log_probs, advantages, args.eps_clip_high)
+    else:
+        pg_loss, pg_clipfrac = compute_policy_loss(ppo_kl, advantages, args.eps_clip, args.eps_clip_high)
 
     # Apply off-policy correction using importance sampling if enabled
     if args.use_tis:
diff --git a/slime/ray/rollout.py b/slime/ray/rollout.py
@@ -180,7 +180,7 @@ def _post_process_rewards(self, samples: Union[list[Sample], list[list[Sample]]]
 
         raw_rewards = [sample.get_reward_value(self.args) for sample in samples]
         if (
-            self.args.advantage_estimator in ["grpo", "gspo", "reinforce_plus_plus_baseline"]
+            self.args.advantage_estimator in ["grpo", "gspo", "cispo", "reinforce_plus_plus_baseline"]
             and self.args.rewards_normalization
         ):
             # group norm
@@ -193,7 +193,7 @@ def _post_process_rewards(self, samples: Union[list[Sample], list[list[Sample]]]
             mean = rewards.mean(dim=-1, keepdim=True)
             rewards = rewards - mean
 
-            if self.args.advantage_estimator in ["grpo", "gspo"] and self.args.grpo_std_normalization:
+            if self.args.advantage_estimator in ["grpo", "gspo", "cispo"] and self.args.grpo_std_normalization:
                 std = rewards.std(dim=-1, keepdim=True)
                 rewards = rewards / (std + 1e-6)
 
diff --git a/slime/utils/arguments.py b/slime/utils/arguments.py
@@ -672,7 +672,7 @@ def add_algo_arguments(parser):
             parser.add_argument(
                 "--advantage-estimator",
                 type=str,
-                choices=["grpo", "gspo", "reinforce_plus_plus", "reinforce_plus_plus_baseline", "ppo"],
+                choices=["grpo", "gspo", "cispo", "reinforce_plus_plus", "reinforce_plus_plus_baseline", "ppo"],
                 default="grpo",
             )
             parser.add_argument(
diff --git a/slime/utils/ppo_utils.py b/slime/utils/ppo_utils.py
@@ -72,6 +72,57 @@ def compute_policy_loss(
     return pg_losses, clipfrac
 
 
+@torch.compile(dynamic=True)
+def compute_cispo_loss(
+    ppo_kl: torch.Tensor,
+    log_probs: torch.Tensor,
+    advantages: torch.Tensor,
+    eps_clip_high: float,
+):
+    """Compute the per-token CISPO (Clipped IS-weight Policy Optimization) loss.
+
+    CISPO caps the importance sampling ratio from above and detaches it, so the
+    ratio only weights the update and is never itself optimized. This differs
+    from PPO which uses both upper and lower clipping without stop-gradient.
+
+    The key formula from the paper:
+        ratio = exp(log π_current - log π_old)
+        ratio_truncated = min(ratio, ε_max)
+        loss = -sg(ratio_truncated) * advantages * log(π_current)
+
+    Note: gradient reaches the policy only through the explicit log_probs
+    factor; no reduction or masking is applied here.
+
+    Args:
+        ppo_kl: Log-ratio (log π_old - log π_current) for each token
+        log_probs: Current policy log probabilities (the differentiable term)
+        advantages: Advantage estimates for each token
+        eps_clip_high: Absolute cap ε_max on the ratio itself (not 1 + ε)
+
+    Returns:
+        Tuple of (pg_losses, clipfrac) where:
+            - pg_losses: Per-token CISPO policy gradient losses
+            - clipfrac: Per-token 0/1 float mask, 1 where ratio > eps_clip_high
+    """
+    # ppo_kl is old - current, so negate before exp to get π_current / π_old
+    ratio = (-ppo_kl).exp()
+
+    # Upper truncation only: clamp is given a max and no lower bound
+    ratio_truncated = torch.clamp(ratio, max=eps_clip_high)
+
+    # Stop-gradient: the truncated ratio acts as a constant weight
+    ratio_sg = ratio_truncated.detach()
+
+    # CISPO formula: -sg(ratio) * advantages * log_probs
+    # This ensures gradient flows through log_probs but not through ratio
+    pg_losses = -ratio_sg * advantages * log_probs
+
+    # Clip indicator is taken on the untruncated ratio; it stays per-token
+    clipfrac = (ratio > eps_clip_high).float()
+
+    return pg_losses, clipfrac
+
+
 def compute_log_probs(logits: torch.Tensor, tokens: torch.Tensor, process_group: Optional[dist.ProcessGroup]):
     from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
 

Original file line number	Diff line number	Diff line change
`@@ -672,7 +672,7 @@ def add_algo_arguments(parser):`
`672`	`672`	`parser.add_argument(`
`673`	`673`	`"--advantage-estimator",`
`674`	`674`	`type=str,`
`675`		`- choices=["grpo", "gspo", "reinforce_plus_plus", "reinforce_plus_plus_baseline", "ppo"],`
	`675`	`+ choices=["grpo", "gspo", "cispo", "reinforce_plus_plus", "reinforce_plus_plus_baseline", "ppo"],`
`676`	`676`	`default="grpo",`
`677`	`677`	`)`
`678`	`678`	`parser.add_argument(`