[muP] Rework #1087
base: main
Changes from 31 commits
The first file in the diff reworks the init-method factories, dropping the optional `mup` dependency and the `use_mup`/`mup_init_scale` plumbing in favor of a single `mup_m_width` multiplier:
```diff
@@ -16,41 +16,22 @@
 import torch
 
-try:
-    import mup
-except ImportError:
-    pass
-
 
-def init_method_normal(sigma, use_mup_outer=False, mup_init_scale=1.0):
+def init_method_normal(sigma):
     """Init method based on N(0, sigma)."""
 
-    def init_(tensor, use_mup=use_mup_outer):
-        if use_mup:
-            mup.init.normal_(tensor, mean=0.0, std=sigma)
-            with torch.no_grad():
-                tensor.mul_(mup_init_scale)
-            return tensor
-        else:
-            return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
 
     return init_
 
 
-def scaled_init_method_normal(
-    sigma, num_layers, use_mup_outer=False, mup_init_scale=1.0
-):
+def scaled_init_method_normal(sigma, num_layers):
     """Init method based on N(0, sigma/sqrt(2*num_layers)."""
     std = sigma / math.sqrt(2.0 * num_layers)
 
-    def init_(tensor, use_mup=use_mup_outer):
-        if use_mup:
-            mup.init.normal_(tensor, mean=0.0, std=std)
-            with torch.no_grad():
-                tensor.mul_(mup_init_scale)
-            return tensor
-        else:
-            return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
 
     return init_
```
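For context, a minimal runnable sketch (not part of the diff) of how the reworked factories are meant to be driven: the closures above are now mup-free, and the muP width correction for the normal inits happens once, upstream, where `get_init_methods` shrinks `sigma` (see the last hunk of this file). `m_width` and `base_std` below are hypothetical values:

```python
import math
import torch

# Sketch only: the closure is mup-free; the muP width correction is
# folded into sigma before the factory is called (as get_init_methods
# does with sigma=args.init_method_std/math.sqrt(args.mup_m_width)).
def init_method_normal(sigma):
    """Init method based on N(0, sigma)."""

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


m_width = 4.0    # hypothetical width multiplier (width / base width)
base_std = 0.02  # hypothetical base init std
init_fn = init_method_normal(base_std / math.sqrt(m_width))

w = init_fn(torch.empty(2048, 2048))
print(f"{w.std().item():.4f}")  # ~0.0100, i.e. base_std / sqrt(m_width)
```

Dividing `sigma` by `sqrt(m_width)` makes the weight variance scale as `1/m_width`, the fan-in-proportional correction muP prescribes for hidden weights.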
```diff
@@ -87,12 +68,12 @@ def _orthogonal(tensor, gain=1):
     return tensor
 
 
-def orthogonal_init_method(n_layers=1, use_mup=False, mup_init_scale=1.0):
+def orthogonal_init_method(n_layers=1, mup_m_width=1.0):
     """Fills the input Tensor with a (semi) orthogonal matrix, as described in
     Exact solutions to the nonlinear dynamics of learning in deep linear neural networks - Saxe, A. et al. (2013)
     Optionally scaling by number of layers possible, as introduced in OBST - Nestler et. al. (2021, to be released)"""
 
-    if use_mup:
+    if mup_m_width != 1:
         raise ValueError(
             "Orthogonal init needs to be patched to support mup. Disable mup or use a different init method to avoid this error"
         )
```
```diff
@@ -103,105 +84,91 @@ def init_(tensor):
     return init_
 
 
-def xavier_uniform_init_method(use_mup_outer=False, mup_init_scale=1.0):
+def xavier_uniform_init_method(mup_m_width=1.0):
     """Fills the input Tensor with values according to the method described in Understanding the difficulty of
     training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a uniform distribution."""
 
-    def init_(tensor, use_mup=use_mup_outer):
-        if use_mup:
-            mup.init.xavier_uniform_(tensor)
+    def init_(tensor, mup_m_width=mup_m_width):
+        init_weight = torch.nn.init.xavier_uniform_(tensor)
+        if mup_m_width != 1:
             with torch.no_grad():
-                tensor.mul_(mup_init_scale)
-            return tensor
-        else:
-            return torch.nn.init.xavier_uniform_(tensor)
+                init_weight.div_(mup_m_width)
+        return init_weight
 
     return init_
 
 
-def xavier_normal_init_method(use_mup_outer=False, mup_init_scale=1.0):
+def xavier_normal_init_method(mup_m_width=1.0):
     """Fills the input Tensor with values according to the method described in Understanding the difficulty of
     training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a normal distribution."""
 
-    def init_(tensor, use_mup=use_mup_outer):
-        if use_mup:
-            mup.init.xavier_normal_(tensor)
+    def init_(tensor, mup_m_width=mup_m_width):
+        init_weight = torch.nn.init.xavier_normal_(tensor)
+        if mup_m_width != 1:
             with torch.no_grad():
-                tensor.mul_(mup_init_scale)
-            return tensor
-        else:
-            return torch.nn.init.xavier_normal_(tensor)
+                init_weight.div_(mup_m_width)
+        return init_weight
 
     return init_
 
 
-def small_init_init_method(dim, use_mup_outer=False, mup_init_scale=1.0):
+def small_init_init_method(dim, mup_m_width=1.0):
     """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving
     the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution."""
     std = math.sqrt(2 / (5 * dim))
 
-    def init_(tensor, use_mup=use_mup_outer):
-        if use_mup:
-            mup.init.normal_(tensor, mean=0.0, std=std)
+    def init_(tensor, mup_m_width=mup_m_width):
+        init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std)
+        if mup_m_width != 1:
             with torch.no_grad():
-                tensor.mul_(mup_init_scale)
-            return tensor
-        else:
-            return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+                init_weight.div_(mup_m_width)
+        return init_weight
 
     return init_
 
 
-def wang_init_method(n_layers, dim, use_mup_outer=False, mup_init_scale=1.0):
+def wang_init_method(n_layers, dim, mup_m_width=1.0):
     std = 2 / n_layers / math.sqrt(dim)
 
-    def init_(tensor, use_mup=use_mup_outer):
-        if use_mup:
-            mup.init.normal_(tensor, mean=0.0, std=std)
+    def init_(tensor, mup_m_width=mup_m_width):
+        init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std)
+        if mup_m_width != 1:
             with torch.no_grad():
-                tensor.mul_(mup_init_scale)
-            return tensor
-        else:
-            return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+                init_weight.div_(mup_m_width)
+        return init_weight
 
     return init_
 
 
 def get_init_methods(args):
 
-    if args.use_mup:
-        try:
-            import mup
-        except ModuleNotFoundError:
-            print("Please install mup https://github.com/microsoft/mup")
-            raise Exception
-
     def _get(name):
         if name == "normal":
             return init_method_normal(
-                args.init_method_std, args.use_mup, args.mup_init_scale
+                sigma=args.init_method_std/math.sqrt(args.mup_m_width)
             )
         elif name == "scaled_normal":
             return scaled_init_method_normal(
-                args.init_method_std, args.num_layers, args.use_mup, args.mup_init_scale
+                sigma=args.init_method_std/math.sqrt(args.mup_m_width),
+                num_layers=args.num_layers
             )
         elif name == "orthogonal":
-            return orthogonal_init_method(args.use_mup, args.mup_init_scale)
+            return orthogonal_init_method(args.mup_m_width)
         elif name == "scaled_orthogonal":
             return orthogonal_init_method(
-                args.num_layers, args.use_mup, args.mup_init_scale
+                args.num_layers, args.mup_m_width
             )
         elif name == "xavier_uniform":
-            return xavier_uniform_init_method(args.use_mup, args.mup_init_scale)
+            return xavier_uniform_init_method(args.mup_m_width)
         elif name == "xavier_normal":
-            return xavier_normal_init_method(args.use_mup, args.mup_init_scale)
+            return xavier_normal_init_method(args.mup_m_width)
         elif name == "wang_init":
             return wang_init_method(
-                args.num_layers, args.hidden_size, args.use_mup, args.mup_init_scale
+                args.num_layers, args.hidden_size, args.mup_m_width
             )
         elif name == "small_init":
             return small_init_init_method(
-                args.hidden_size, args.use_mup, args.mup_init_scale
+                args.hidden_size, args.mup_m_width
             )
         else:
             raise NotImplementedError(f"Unknown init method {name}")
```
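Note the two different corrections in this hunk: the Xavier, small-init, and wang factories divide the drawn weights in place by the full `mup_m_width`, while the normal inits (wired up in `get_init_methods`) shrink `sigma` by only `sqrt(args.mup_m_width)`. A self-contained sketch of the new closure pattern, with hypothetical `dim` and `mup_m_width` values:

```python
import math
import torch

# Standalone copy of the closure pattern from the diff above: draw from
# the base distribution, then divide in place by the width multiplier
# when muP is active (mup_m_width != 1).
def small_init_init_method(dim, mup_m_width=1.0):
    std = math.sqrt(2 / (5 * dim))

    def init_(tensor, mup_m_width=mup_m_width):
        init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std)
        if mup_m_width != 1:
            with torch.no_grad():
                init_weight.div_(mup_m_width)
        return init_weight

    return init_


init_fn = small_init_init_method(dim=1024, mup_m_width=4.0)  # hypothetical
w = init_fn(torch.empty(1024, 1024))
print(f"{w.std().item():.5f}")  # ~ sqrt(2 / (5 * 1024)) / 4, not / sqrt(4)
```

Which correction a tensor should receive under muP depends on whether it is treated as hidden-like or output-like; the sketch illustrates only the mechanics, not the derivation.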
The second file in the diff covers the attention score scaling and the LM-logit path:
```diff
@@ -306,13 +306,13 @@ def __init__(
         )
 
         coeff = None
-        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
-        if self.apply_query_key_layer_scaling:
-            coeff = max(1, self.layer_number)
-            self.norm_factor *= coeff
+        if neox_args.use_mup:
+            self.norm_factor = self.hidden_size_per_attention_head
+        else:
+            self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+            if self.apply_query_key_layer_scaling:
+                coeff = max(1, self.layer_number)
+                self.norm_factor *= coeff
 
         self.rpe = rpe
```
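The attention class divides raw `q·k` scores by `norm_factor`, so this hunk switches muP runs from the standard `1/sqrt(d_head)` softmax scaling to muP's `1/d_head` scaling. An illustrative sketch with hypothetical shapes:

```python
import math
import torch

# Illustrative only: hypothetical single-head scores, no masking/softmax.
d_head = 64
q = torch.randn(8, d_head)
k = torch.randn(8, d_head)

scores_sp = (q @ k.T) / math.sqrt(d_head)  # standard parametrization
scores_mup = (q @ k.T) / d_head            # muP: norm_factor = d_head

print(scores_sp.std().item(), scores_mup.std().item())
```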
```diff
@@ -960,7 +960,7 @@ def forward(self, args):
     return self.norm(args)
 
 
-def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None):
+def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None, args=None):
     """LM logits using word embedding weights."""
     # Parallel logits.
     input_parallel = mpu.copy_to_model_parallel_region(input_)
@@ -971,6 +971,9 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non
     else:
         logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias)
 
+    if args is not None and args.use_mup:
+        logits_parallel /= args.mup_m_width
+
     # Gather if needed.
     if parallel_output:
         return logits_parallel
```
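Dividing the unembedding logits by `args.mup_m_width` at forward time mirrors muP's width-dependent output multiplier, keeping the logit scale roughly constant as the model widens. A hypothetical, dependency-free sketch of the effect (the real function runs under model parallelism via `mpu`):

```python
import torch
import torch.nn.functional as F

# Hypothetical shapes; the PR applies this inside parallel_lm_logits.
hidden = torch.randn(4, 512)          # [tokens, hidden]
emb_weight = torch.randn(50304, 512)  # [vocab, hidden] tied embedding
mup_m_width = 4.0                     # hypothetical width multiplier
use_mup = True

logits = F.linear(hidden, emb_weight)  # [tokens, vocab]
if use_mup:
    logits /= mup_m_width  # muP output scaling, as in the hunk above
```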