Commit 9bf30dd

Kyle1668 and claude committed
fix: Update gradient difference to match inverted formula
Updated documentation and tests to match the inverted gradient difference formula: L_total = α * L_retain - L_forget

Now gd_retain_weight semantics are intuitive:
- Higher values (40-100) = more retention, less forgetting
- Lower values (1-10) = more aggressive unlearning

Updated test expectations to match new formula

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent f1c6570 commit 9bf30dd

File tree: 2 files changed (+11, -10 lines)
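A quick numeric illustration of the inverted formula from the commit message; the loss values below are placeholders for demonstration, not outputs from any real run:

# Placeholder per-batch losses (illustrative values only).
retain_loss = 3.0   # cross-entropy on the retain dataset
forget_loss = 2.5   # cross-entropy on the forget (GA) dataset

# Inverted gradient difference: L_total = α * L_retain - L_forget
for alpha in (2.0, 40.0):
    total = alpha * retain_loss - forget_loss
    print(f"alpha={alpha}: L_total={total}")
# alpha=2.0  -> L_total=3.5   (low α: the -L_forget term has relatively more pull -> aggressive unlearning)
# alpha=40.0 -> L_total=117.5 (high α: retain term dominates -> more retention, less forgetting)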

megatron/neox_arguments/neox_args.py

Lines changed: 6 additions & 5 deletions
@@ -1172,7 +1172,7 @@ class NeoXArgsTraining(NeoXArgsTemplate):
     """
     Enable gradient difference mode. When enabled, the model performs gradient
     difference unlearning using both forget (GA) and retain datasets. This replaces
-    pure gradient ascent with the formula: L_total = L_retain - α * L_forget
+    pure gradient ascent with the formula: L_total = α * L_retain - L_forget
     """
 
     gd_retain_dataset: str = None
@@ -1189,10 +1189,11 @@ class NeoXArgsTraining(NeoXArgsTemplate):
 
     gd_retain_weight: float = 40.0
     """
-    Weight (α) for the forget loss in gradient difference formula.
-    Higher values provide stronger retention of general capabilities.
-    The combined loss is: L_retain - α * L_forget
-    Based on Composable Interventions paper, values around 40 work well.
+    Weight (α) for the retain loss in gradient difference formula.
+    Higher values provide stronger retention of general capabilities (less forgetting).
+    Lower values allow more aggressive unlearning (more forgetting).
+    The combined loss is: α * L_retain - L_forget
+    Typical values: 1-10 for aggressive unlearning, 40-100 for balanced unlearning.
     """
 
     gd_log_separate_losses: bool = True
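The docstrings above only describe the semantics; as a hedged sketch of how a training step could consume these fields (the function name, signature, and print-based logging below are assumptions for illustration, not the GPT-NeoX implementation):

import torch

def gradient_difference_loss(retain_loss: torch.Tensor,
                             forget_loss: torch.Tensor,
                             gd_retain_weight: float = 40.0,
                             gd_log_separate_losses: bool = True) -> torch.Tensor:
    """Combine the two losses as documented: L_total = α * L_retain - L_forget."""
    gd_loss = gd_retain_weight * retain_loss - forget_loss
    if gd_log_separate_losses:
        # Stand-in for whatever logger the real training loop uses.
        print(f"retain={retain_loss.item():.4f} forget={forget_loss.item():.4f} gd={gd_loss.item():.4f}")
    return gd_loss

# Placeholder scalar losses; with the default α = 40.0 this yields 117.5,
# the same value the updated unit test asserts.
loss = gradient_difference_loss(torch.tensor(3.0), torch.tensor(2.5))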

tests/unit/test_gradient_ascent.py

Lines changed: 5 additions & 5 deletions
@@ -1125,11 +1125,11 @@ def test_gradient_difference_loss_formula(self):
         alpha = 40.0
 
         # Compute gradient difference loss
-        # L_total = L_retain - α * L_forget
-        gd_loss = retain_loss - alpha * forget_loss
+        # L_total = α * L_retain - L_forget
+        gd_loss = alpha * retain_loss - forget_loss
 
-        # Expected: 3.0 - 40.0 * 2.5 = 3.0 - 100.0 = -97.0
-        assert gd_loss.item() == -97.0
+        # Expected: 40.0 * 3.0 - 2.5 = 120.0 - 2.5 = 117.5
+        assert gd_loss.item() == 117.5
 
     def test_gradient_difference_direction(self):
         """Test that gradient difference moves in correct directions."""
@@ -1144,7 +1144,7 @@ def test_gradient_difference_direction(self):
 
         # Gradient difference objective
         alpha = 1.0
-        combined_loss = retain_loss - alpha * forget_loss
+        combined_loss = alpha * retain_loss - forget_loss
 
         # Compute gradients
         combined_loss.backward()
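A standalone, runnable variant of the direction check above; the toy quadratic losses and single-scalar "parameters" are invented for illustration and are not from the test file:

import torch

# Two independent toy parameters so each loss term pulls on its own weight.
w_r = torch.tensor(0.0, requires_grad=True)
w_f = torch.tensor(0.0, requires_grad=True)

retain_loss = (w_r - 2.0) ** 2   # minimized at w_r = 2
forget_loss = (w_f - 2.0) ** 2   # minimized at w_f = 2

alpha = 1.0
combined_loss = alpha * retain_loss - forget_loss
combined_loss.backward()

# Gradient descent steps in the -grad direction:
#   w_r.grad = -4 -> w_r moves toward 2, so retain_loss decreases (retention)
#   w_f.grad = +4 -> w_f moves away from 2, so forget_loss increases (unlearning)
print(w_r.grad, w_f.grad)  # tensor(-4.) tensor(4.)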
