Skip to content

Commit 0695c73

Browse files
adding kl metrics
1 parent 984c724 commit 0695c73

File tree

3 files changed

+77
-34
lines changed

3 files changed

+77
-34
lines changed

slime/backends/fsdp_utils/actor.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,14 @@
1818
raise ImportError("FSDP v2 not available")
1919

2020
import wandb
21+
2122
from slime.ray.train_actor import TrainRayActor
2223
from slime.utils.data import get_minimum_num_micro_batch_size, process_rollout_data
2324
from slime.utils.distributed_utils import get_gloo_group
2425
from slime.utils.memory_utils import clear_memory
2526
from slime.utils.ppo_utils import compute_approx_kl, compute_policy_loss
2627
from slime.utils.timer import Timer, timer
27-
from slime.utils.tis import compute_tis_weights
28+
from slime.utils.tis import compute_kl_metrics, compute_tis_weights
2829
from slime.utils.wandb_utils import init_wandb_secondary
2930

3031
from .data_packing import pack_sequences, unpack_sequences
@@ -336,7 +337,6 @@ def train(self, rollout_id, rollout_data_ref):
336337
rollout_log_probs = torch.cat([batch["rollout_log_probs"] for batch in unpacked_batches], dim=0).to(
337338
device=log_probs.device
338339
)
339-
old_log_probs_flat = old_log_probs
340340

341341
# Build eos mask from loss masks
342342
eos_mask = torch.cat(loss_masks, dim=0).to(device=log_probs.device)
@@ -349,7 +349,7 @@ def train(self, rollout_id, rollout_data_ref):
349349
lower = getattr(self.args, "tis_clip_low", 0.0)
350350

351351
tis_weights, tis_metrics = compute_tis_weights(
352-
old_log_prob=old_log_probs_flat,
352+
old_log_prob=old_log_probs,
353353
rollout_log_prob=rollout_log_probs,
354354
eos_mask=eos_mask,
355355
level=getattr(self.args, "tis_level", "token"),
@@ -365,6 +365,14 @@ def train(self, rollout_id, rollout_data_ref):
365365
if tis_weights is not None:
366366
pg_loss = pg_loss * tis_weights
367367

368+
# KL metrics next to TIS metrics
369+
kl_metrics = compute_kl_metrics(
370+
old_log_prob=old_log_probs,
371+
rollout_log_prob=rollout_log_probs,
372+
eos_mask=eos_mask,
373+
response_lengths=response_lengths,
374+
)
375+
368376
pg_loss = sum_of_sample_mean(pg_loss, response_lengths, loss_masks)
369377
pg_clipfrac = sum_of_sample_mean(pg_clipfrac, response_lengths, loss_masks)
370378
ppo_kl = sum_of_sample_mean(ppo_kl.abs(), response_lengths, loss_masks)
@@ -399,20 +407,9 @@ def train(self, rollout_id, rollout_data_ref):
399407

400408
if self.args.use_tis and tis_weights is not None:
401409
reported["ois"] = sum_of_sample_mean(ois, response_lengths, loss_masks).detach()
402-
# Extended metrics
403-
for k in [
404-
"tis_mean",
405-
"tis_std",
406-
"tis_ratio_fraction_high",
407-
"tis_ratio_fraction_low",
408-
"tis_seq_clipped_fraction",
409-
"tis_veto_fraction",
410-
]:
411-
if k in tis_metrics:
412-
val = tis_metrics[k]
413-
reported[k] = (
414-
val.detach() if torch.is_tensor(val) else torch.tensor(val, device=log_probs.device)
415-
)
410+
# Report all TIS and KL metrics uniformly
411+
for k, v in {**tis_metrics, **kl_metrics}.items():
412+
reported[k] = v.detach() if torch.is_tensor(v) else torch.tensor(v, device=log_probs.device)
416413

417414
# Scale loss for gradient accumulation
418415
loss = loss * dist.get_world_size() / self.args.global_batch_size

slime/backends/megatron_utils/loss.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
get_reinforce_plus_plus_baseline_advantages,
1515
get_reinforce_plus_plus_returns,
1616
)
17-
from slime.utils.tis import compute_tis_weights
17+
from slime.utils.tis import compute_kl_metrics, compute_tis_weights
1818

1919
from .cp_utils import all_gather_with_cp, get_logits_and_tokens_offset_with_cp, get_sum_of_sample_mean
2020

@@ -309,7 +309,7 @@ def policy_loss_function(args, batch, logits, sum_of_sample_mean):
309309
if args.use_tis:
310310
assert "rollout_log_probs" in batch, "rollout_log_probs must be provided for TIS"
311311
rollout_log_probs = torch.cat(batch["rollout_log_probs"], dim=0)
312-
old_log_probs_flat = torch.cat(batch["log_probs"], dim=0)
312+
old_log_probs = torch.cat(batch["log_probs"], dim=0)
313313

314314
# Build eos mask from loss masks (concatenated) to match flattened tensors
315315
eos_mask = torch.cat(batch["loss_masks"], dim=0).to(device=log_probs.device)
@@ -323,7 +323,7 @@ def policy_loss_function(args, batch, logits, sum_of_sample_mean):
323323
)
324324

325325
tis_weights, tis_metrics = compute_tis_weights(
326-
old_log_prob=old_log_probs_flat,
326+
old_log_prob=old_log_probs,
327327
rollout_log_prob=rollout_log_probs,
328328
eos_mask=eos_mask,
329329
level=getattr(args, "tis_level", "token"),
@@ -340,6 +340,14 @@ def policy_loss_function(args, batch, logits, sum_of_sample_mean):
340340
if tis_weights is not None:
341341
pg_loss = pg_loss * tis_weights
342342

343+
# KL metrics next to TIS metrics
344+
kl_metrics = compute_kl_metrics(
345+
old_log_prob=old_log_probs,
346+
rollout_log_prob=rollout_log_probs,
347+
eos_mask=eos_mask,
348+
response_lengths=batch["response_lengths"],
349+
)
350+
343351
pg_loss = sum_of_sample_mean(pg_loss)
344352
pg_clipfrac = sum_of_sample_mean(pg_clipfrac)
345353
ppo_kl = sum_of_sample_mean(ppo_kl)
@@ -381,20 +389,9 @@ def policy_loss_function(args, batch, logits, sum_of_sample_mean):
381389
if args.use_tis:
382390
# Backward compatible basic logs
383391
reported_loss["ois"] = sum_of_sample_mean(ois).clone().detach()
384-
# Extended metrics from generalized TIS
385-
for k in [
386-
"tis_mean",
387-
"tis_std",
388-
"tis_ratio_fraction_high",
389-
"tis_ratio_fraction_low",
390-
"tis_seq_clipped_fraction",
391-
"tis_veto_fraction",
392-
]:
393-
if k in tis_metrics:
394-
val = tis_metrics[k]
395-
reported_loss[k] = (
396-
val.clone().detach() if torch.is_tensor(val) else torch.tensor(val, device=logits.device)
397-
)
392+
# Report all TIS and KL metrics uniformly
393+
for k, v in {**tis_metrics, **kl_metrics}.items():
394+
reported_loss[k] = v.clone().detach() if torch.is_tensor(v) else torch.tensor(v, device=logits.device)
398395

399396
return loss, reported_loss
400397

slime/utils/tis.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,55 @@ def compute_is_metrics(
107107
return metrics
108108

109109

110+
def compute_kl_metrics(
111+
*,
112+
old_log_prob: torch.Tensor,
113+
rollout_log_prob: torch.Tensor,
114+
eos_mask: Optional[torch.Tensor],
115+
response_lengths: Optional[list[int]] = None,
116+
) -> Dict[str, Any]:
117+
metrics: Dict[str, Any] = {}
118+
119+
device = old_log_prob.device
120+
if eos_mask is None:
121+
eos_mask = torch.ones_like(old_log_prob, dtype=torch.bool, device=device)
122+
123+
# Direct estimator for KL(pi_rollout || pi_old): E[log pi_rollout - log pi_old]
124+
metrics["rollout_kl"] = masked_mean(rollout_log_prob - old_log_prob, eos_mask)
125+
126+
# K3 estimator: E[exp(log(pi_old/pi_rollout)) - log(pi_old/pi_rollout) - 1]
127+
log_ratio = old_log_prob - rollout_log_prob
128+
k3_matrix = torch.exp(log_ratio) - log_ratio - 1
129+
metrics["rollout_k3_kl"] = masked_mean(k3_matrix, eos_mask)
130+
131+
# Sequence-level perplexity difference metrics
132+
if old_log_prob.dim() == 2:
133+
mean_log_prob_rollout_per_seq = masked_mean(rollout_log_prob, eos_mask, dim=-1)
134+
mean_log_prob_old_per_seq = masked_mean(old_log_prob, eos_mask, dim=-1)
135+
elif response_lengths is not None and len(response_lengths) > 0 and old_log_prob.dim() == 1:
136+
seq_rollout_means = []
137+
seq_old_means = []
138+
start = 0
139+
for length in response_lengths:
140+
end = start + int(length)
141+
mask_chunk = eos_mask[start:end] if eos_mask is not None else None
142+
seq_rollout_means.append(masked_mean(rollout_log_prob[start:end], mask_chunk))
143+
seq_old_means.append(masked_mean(old_log_prob[start:end], mask_chunk))
144+
start = end
145+
mean_log_prob_rollout_per_seq = torch.stack(seq_rollout_means)
146+
mean_log_prob_old_per_seq = torch.stack(seq_old_means)
147+
else:
148+
# Fallback to global means if sequence boundaries are unavailable
149+
mean_log_prob_rollout_per_seq = masked_mean(rollout_log_prob, eos_mask).unsqueeze(0)
150+
mean_log_prob_old_per_seq = masked_mean(old_log_prob, eos_mask).unsqueeze(0)
151+
152+
diff = mean_log_prob_rollout_per_seq - mean_log_prob_old_per_seq
153+
metrics["log_ppl_diff"] = diff.mean()
154+
metrics["log_ppl_abs_diff"] = diff.abs().mean()
155+
156+
return metrics
157+
158+
110159
def compute_tis_weights(
111160
*,
112161
old_log_prob: torch.Tensor,

0 commit comments

Comments (0)