Merge pull request #28 from complex-reasoning/codex/pr23-safer-fixes

yifanzhang-pro · web-flow · commit c0574e46b74d · 2026-04-28T20:36:49.000-04:00
[codex] fix data indexing and clip metrics
diff --git a/process-data.py b/process-data.py
@@ -2,6 +2,15 @@
 import numpy as np
 import os # Import os for path manipulation
 
+
+def build_extra_info(value: object, index: int) -> dict[str, object]:  # build a dict that carries `index` under "index"
+    if isinstance(value, dict):  # an existing dict keeps its other entries
+        extra_info = dict(value)  # shallow copy, so the caller's dict is not mutated
+    else:
+        extra_info = {}  # any non-dict value (None included) is discarded
+    extra_info["index"] = index  # overwrites an "index" entry if one was present
+    return extra_info
+
 # --- Configuration ---
 # Define the directory containing the input file
 data_directory = 'data'
@@ -30,9 +39,7 @@
         dummy_df.to_parquet(input_parquet_path)
 
 
-    # Read the Parquet file into a pandas DataFrame
-    # We don't need the original index, so we can reset it immediately if needed,
-    # but setting df.index directly below overwrites it anyway.
+    # Read the Parquet file into a pandas DataFrame.
     print(f"Reading Parquet file from: {input_parquet_path}")
     df = pd.read_parquet(input_parquet_path)
     print("Original DataFrame info:")
@@ -45,23 +52,31 @@
     num_rows = len(df)
     print(f"\nDataFrame has {num_rows} rows.")
 
-    # Create a new sequential index starting from 1 up to the number of rows
-    # Name the new index 'extra_info' as requested
-    print("Generating new sequential index named 'extra_info' from 1...")
-    new_index = pd.RangeIndex(start=1, stop=num_rows + 1, step=1, name='extra_info')
+    # RLHFDataset reads row_dict["extra_info"]["index"], so store the repeat
+    # index inside the extra_info column rather than as a pandas index.
+    print("Generating 0-based extra_info.index values...")
+    if "extra_info" in df.columns:
+        existing_extra_info = df["extra_info"].tolist()
+    else:
+        existing_extra_info = [None] * num_rows
 
-    # Set the new index for the DataFrame, replacing the old one
-    df.index = new_index
-    print("New index assigned.")
+    df["extra_info"] = [
+        build_extra_info(value=value, index=index)
+        for index, value in enumerate(existing_extra_info)
+    ]
+    df = df.reset_index(drop=True)
+    print("extra_info.index assigned.")
 
     # Write the modified DataFrame back to a new Parquet file
-    # index=True ensures the new index ('extra_info') is written to the file
     print(f"Writing modified DataFrame to: {output_parquet_path}")
-    df.to_parquet(output_parquet_path, index=True)
+    df.to_parquet(output_parquet_path, index=False)
 
     print("\n--- Success ---")
     print(f"Successfully processed '{input_parquet_path}'.")
-    print(f"Created new index named 'extra_info' from 1 to {num_rows}.")
+    if num_rows:
+        print(f"Created 0-based extra_info.index values from 0 to {num_rows - 1}.")
+    else:
+        print("Created empty extra_info.index values.")
     print(f"Output saved to '{output_parquet_path}'.")
 
     # Display the first few rows with the new index to verify
diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
@@ -135,11 +135,12 @@ def compute_grpo_outcome_advantage(token_level_rewards: torch.Tensor,
             id2score[index[i]].append(scores[i])
         for idx in id2score:
             if len(id2score[idx]) == 1:
-                id2mean[idx] = torch.tensor(0.0)
-                id2std[idx] = torch.tensor(1.0)
+                id2mean[idx] = scores.new_tensor(0.0)
+                id2std[idx] = scores.new_tensor(1.0)
             elif len(id2score[idx]) > 1:
-                id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))
-                id2std[idx] = torch.std(torch.tensor([id2score[idx]]))
+                scores_tensor = torch.stack(id2score[idx])
+                id2mean[idx] = scores_tensor.mean()
+                id2std[idx] = scores_tensor.std()
             else:
                 raise ValueError(f"no score in prompt index: {idx}")
         for i in range(bsz):
@@ -522,7 +523,7 @@ def compute_policy_loss_reinforce(old_log_prob,
         ppo_kl: (float)
             the estimated KL divergence between the latest updating policy and the old sampling policy
         pg_clipfrac_lower: (float)
-            the fraction of policy gradient loss being clipped when the advantage is negative
+            the fraction of policy gradient loss being clipped at the lower bound
     """
 
     negative_approx_kl = log_prob - old_log_prob
@@ -567,9 +568,12 @@ def compute_policy_loss_reinforce(old_log_prob,
     else:
         A = (advantages * w_ + kl_term).detach()
         pg_losses = -A * log_prob
-        pg_clipfrac = verl_F.masked_mean(torch.gt(pg_losses, pg_losses).float(), response_mask)
-        pg_clipfrac_lower = verl_F.masked_mean(
-            torch.gt(pg_losses, pg_losses) * (advantages < 0).float(), response_mask)
+        # This branch uses hard-clipped importance weights in A, so report how
+        # often w falls outside the clamp bounds.
+        lower_clipped = w < (1 - clip_ratio_low)
+        upper_clipped = w > (1 + clip_ratio_high)
+        pg_clipfrac = verl_F.masked_mean((lower_clipped | upper_clipped).float(), response_mask)
+        pg_clipfrac_lower = verl_F.masked_mean(lower_clipped.float(), response_mask)
             
     pg_loss = agg_loss(loss_mat=pg_losses, loss_mask=response_mask, loss_agg_mode=loss_agg_mode)