Commit d130816

Modify Muon optimizer (#21885)
* modify muon.
* modify gemini review.
* modify
1 parent 8c87f5d commit d130816

File tree

2 files changed: +82 -16 lines changed

keras/src/optimizers/muon.py
keras/src/optimizers/muon_test.py


keras/src/optimizers/muon.py

Lines changed: 49 additions & 15 deletions
@@ -20,7 +20,7 @@ class Muon(optimizer.Optimizer):
     The Muon optimizer can use both the Muon update step or the
     AdamW update step based on the following:
 
-    - For any variable that isn't 2D, 3D or 4D, the AdamW step
+    - For any variable that isn't 2D, the AdamW step
         will be used. This is not configurable.
     - If the argument `exclude_embeddings` (defaults to `True`) is set
         to `True`, the AdamW step will be used.
@@ -46,10 +46,12 @@ class Muon(optimizer.Optimizer):
             that takes no arguments and returns the actual value to use.
             The exponential decay rate for the 1st moment estimates. Defaults to
             `0.9`.
-        adam_beta_2: A float value or a constant float tensor, ora callable
+        adam_beta_2: A float value or a constant float tensor, or a callable
             that takes no arguments and returns the actual value to use.
             The exponential decay rate for the 2nd moment estimates. Defaults to
             `0.999`.
+        adam_weight_decay: Float. If set, weight decay is applied when using
+            the Adam optimizer.
         epsilon: A small constant for numerical stability. This is
             "epsilon hat" in the Kingma and Ba paper
             (in the formula just before Section 2.1),
@@ -67,20 +69,25 @@ class Muon(optimizer.Optimizer):
             It is recommended to use the default value
         adam_lr_ratio: Float, the ratio of the learning rate when
             using Adam to the main learning rate.
-            it is recommended to set it to 0.1
+            It is recommended to set it to 1
         momentum: Float, momentum used by internal SGD.
         ns_steps: Integer, number of Newton-Schulz iterations to run.
         nesterov: Boolean, whether to use Nesterov-style momentum
         {{base_optimizer_keyword_args}}
+        rms_rate: Float. A parameter from https://arxiv.org/abs/2502.16982
+            that can enhance the stability of Muon, allowing it to use the
+            same learning rate and weight decay as Adam. Defaults to `0.2`.
+            Set to `None` to disable this feature.
     """
 
     def __init__(
         self,
         learning_rate=0.001,
         adam_beta_1=0.9,
         adam_beta_2=0.999,
+        adam_weight_decay=0.004,
         epsilon=1e-7,
-        weight_decay=0.1,
+        weight_decay=0.004,
         clipnorm=None,
         clipvalue=None,
         global_clipnorm=None,
@@ -95,10 +102,11 @@ def __init__(
         muon_a=3.4445,
         muon_b=-4.7750,
         muon_c=2.0315,
-        adam_lr_ratio=0.1,
+        adam_lr_ratio=1,
         momentum=0.95,
-        ns_steps=6,
+        ns_steps=5,
         nesterov=True,
+        rms_rate=0.2,
         **kwargs,
     ):
         super().__init__(
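
As a quick, hedged illustration of the new surface area (commentary, not part of the diff): the constructor now accepts `adam_weight_decay` and `rms_rate`, and the defaults for `weight_decay`, `adam_lr_ratio`, and `ns_steps` change as shown above. The sketch below assumes a Keras build that already contains this commit; all argument names come from the diff.

# Sketch only: constructing Muon with the arguments introduced or changed here.
import keras

optimizer = keras.optimizers.Muon(
    learning_rate=1e-3,
    weight_decay=0.004,       # decoupled decay for variables taking the Muon step
    adam_weight_decay=0.004,  # decoupled decay for variables routed to AdamW
    adam_lr_ratio=1,          # Adam now shares the main learning rate by default
    ns_steps=5,               # Newton-Schulz iterations
    rms_rate=0.2,             # Moonlight-style update scaling; None disables it
)

model = keras.Sequential([keras.Input(shape=(8,)), keras.layers.Dense(4)])
model.compile(optimizer=optimizer, loss="mse")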
@@ -127,12 +135,13 @@ def __init__(
         self.nesterov = nesterov
         self.exclude_embeddings = exclude_embeddings
         self.exclude_layers = exclude_layers or []
+        self.adam_weight_decay = adam_weight_decay
+        self.rms_rate = rms_rate
 
     def _should_use_adamw(self, variable):
-        # To use it with 4D convolutional filters,
         # it works well to just flatten their last 3 dimensions.
         # any {0,1}-D parameters should all be optimized by adam
-        if not 1 < len(variable.shape) < 4:
+        if len(variable.shape) != 2:
             return True
         if self.exclude_embeddings and "embedding" in variable.path.lower():
             return True
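
To make the new routing rule concrete, here is a minimal standalone sketch (written for this note, not code from the repository; it simplifies the real path-based `exclude_layers` matching): after this change only rank-2 variables take the Muon step, while everything else, plus embeddings and excluded layers, falls back to AdamW.

# Illustrative dispatch rule only; the hypothetical helper mirrors the diff above.
def should_use_adamw(shape, path="", exclude_embeddings=True, exclude_layers=()):
    if len(shape) != 2:  # 0D/1D/3D/4D variables -> AdamW
        return True
    if exclude_embeddings and "embedding" in path.lower():
        return True
    return any(keyword in path for keyword in exclude_layers)

assert should_use_adamw((16,)) is True            # bias vector -> AdamW
assert should_use_adamw((64, 64)) is False        # dense kernel -> Muon
assert should_use_adamw((3, 3, 8, 16)) is True    # conv kernel -> AdamW (previously Muon)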
@@ -185,18 +194,13 @@ def update_step(self, gradient, variable, learning_rate):
     def _muon_update_step(self, gradient, variable, lr):
         m = self.adam_momentums[variable.path]
         self.assign_add(m, ops.add(gradient, m * (self.momentum - 1)))
-        shape = variable.shape
         if self.nesterov:
             g = ops.add(gradient, self.momentum * m)
         else:
             g = m
+        update = self.zeropower_via_newtonschulz5(g, self.ns_steps)
 
-        self.assign_sub(
-            variable,
-            lr
-            * self.zeropower_via_newtonschulz5(g, self.ns_steps)
-            * max(1, shape[0] / shape[1]) ** 0.5,
-        )
+        self.assign_sub(variable, self.lr_adjust(lr * update))
 
     def _adamw_update_step(self, gradient, variable, learning_rate):
         """Update step given gradient and the associated model variable."""
@@ -239,6 +243,20 @@ def transpose_last_axis(self, X):
         X = ops.transpose(X, temp_order)
         return X
 
+    def lr_adjust(self, x):
+        """Adjusts learning rate based on the Moonlight implementation.
+        This method enhances the stability of Muon, allowing it to use the same
+        learning rate and weight decay as Adam. For details, see
+        https://arxiv.org/abs/2502.16982.
+        For a 2D matrix, the update is scaled by `sqrt(max(n, m)) * rms_rate`,
+        where `n` and `m` are the dimensions of the matrix.
+        """
+        if self.rms_rate is None:
+            return x
+        # moonlight version
+        # https://github.com/MoonshotAI/Moonlight/blob/master/examples/toy_train.py
+        return x * ops.sqrt(ops.maximum(x.shape[0], x.shape[1])) * self.rms_rate
+
     def zeropower_via_newtonschulz5(self, x, steps: int):
         """We apply the Newton-Schulz iteration to compute matrix G.
 
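
A worked check of the new scaling (arithmetic written for this note, not repository code): the old update step scaled the orthogonalized update by `max(1, n / m) ** 0.5`, while `lr_adjust` scales it by `sqrt(max(n, m)) * rms_rate`. For a `(4, 2)` update with the default `rms_rate=0.2`:

# Illustrative arithmetic only; both formulas appear in the diff above.
import math

n, m, rms_rate = 4, 2, 0.2
old_scale = max(1, n / m) ** 0.5             # previous behaviour: sqrt(2) ~= 1.414
new_scale = math.sqrt(max(n, m)) * rms_rate  # lr_adjust: sqrt(4) * 0.2 = 0.4
assert abs(new_scale - 0.4) < 1e-12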
@@ -268,6 +286,20 @@ def zeropower_via_newtonschulz5(self, x, steps: int):
         x = self.transpose_last_axis(x)
         return x
 
+    def _apply_weight_decay(self, variables):
+        for variable in variables:
+            if not self._use_weight_decay(variable):
+                continue
+            if self._should_use_adamw(variable):
+                weight_decay_value = self.adam_weight_decay
+            else:
+                weight_decay_value = self.weight_decay
+            if weight_decay_value is None:
+                continue
+            wd = ops.cast(weight_decay_value, variable.dtype)
+            lr = ops.cast(self.learning_rate, variable.dtype)
+            variable.assign(variable - variable * wd * lr)
+
     def get_config(self):
         config = super().get_config()
         config.update(
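
The new `_apply_weight_decay` override applies decoupled weight decay, choosing `adam_weight_decay` or `weight_decay` with the same AdamW/Muon routing. A small numeric sketch of the formula (commentary, not repository code):

# variable <- variable - variable * wd * lr, with wd picked per routing.
variable, lr, wd = 2.0, 1.0, 0.01
variable = variable - variable * wd * lr
assert abs(variable - 1.98) < 1e-9  # matches the expectation in test_adamw_weight_decay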
@@ -284,6 +316,8 @@ def get_config(self):
                 "ns_steps": self.ns_steps,
                 "nesterov": self.nesterov,
                 "exclude_embeddings": self.exclude_embeddings,
+                "adam_weight_decay": self.adam_weight_decay,
+                "rms_rate": self.rms_rate,
             }
         )
         return config

keras/src/optimizers/muon_test.py

Lines changed: 33 additions & 1 deletion
@@ -74,7 +74,10 @@ def test_muon_single_step(self):
         optimizer.build([vars])
         optimizer._muon_update_step(grads, vars, 0.5)
         self.assertAllClose(
-            vars, [[1.13, 1.51], [2.57, 4.06]], rtol=1e-2, atol=1e-2
+            vars,
+            [[0.988775, 1.887053], [2.873428, 3.97035]],
+            rtol=1e-2,
+            atol=1e-2,
         )
 
     def test_clip_norm(self):
@@ -88,3 +91,32 @@ def test_clip_value(self):
         grad = [np.array([100.0, 100.0])]
         clipped_grad = optimizer._clip_gradients(grad)
         self.assertAllClose(clipped_grad[0], [1.0, 1.0])
+
+    def test_muon_weight_decay(self):
+        variable = backend.Variable([[1.0, 2.0], [3.0, 4.0]])
+        weight_decay = 0.01
+        expected_variable = variable - variable * weight_decay
+        optimizer = Muon(learning_rate=1.0, weight_decay=weight_decay)
+        optimizer._apply_weight_decay([variable])
+        self.assertAllClose(variable, expected_variable, rtol=1e-4, atol=1e-4)
+
+    def test_adamw_weight_decay(self):
+        variable = backend.Variable(2.0)
+        weight_decay = 0.01
+        expected_variable = variable - variable * weight_decay
+        optimizer = Muon(learning_rate=1.0, adam_weight_decay=weight_decay)
+        optimizer._apply_weight_decay([variable])
+
+        self.assertAllClose(variable, expected_variable, rtol=1e-4, atol=1e-4)
+
+    def test_lr_adjust_none(self):
+        opt = Muon(rms_rate=None)
+        x = ops.ones((4, 4))
+        want = x
+        self.assertAllClose(opt.lr_adjust(x), want)
+
+    def test_lr_adjust_2d(self):
+        opt = Muon(rms_rate=0.2)
+        x = ops.ones((4, 2))
+        want = x * 0.2 * 2
+        self.assertAllClose(opt.lr_adjust(x), want)
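
For reference, the expectation in `test_lr_adjust_2d` can be reproduced directly. This is a hedged sketch that assumes a Keras build containing this commit, since `rms_rate` and `lr_adjust` only exist after it:

# Every entry of a (4, 2) ones tensor should become sqrt(max(4, 2)) * 0.2 = 0.4.
from keras import ops
from keras.optimizers import Muon

opt = Muon(rms_rate=0.2)
x = ops.ones((4, 2))
print(opt.lr_adjust(x))  # ~0.4 everywhere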
