@@ -1,7 +1,7 @@
 # Copyright © 2023 Apple Inc.
 
 import textwrap
-from typing import Any, Callable, List, Optional, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 
 import mlx.core as mx
 from mlx.utils import tree_flatten, tree_unflatten
@@ -61,6 +61,7 @@ def __init__(self):
 
     @property
     def training(self):
+        """Boolean indicating if the model is in training mode."""
         return self._training
 
     def _extra_repr(self):
@@ -87,15 +88,83 @@ def __getattr__(self, key: str):
     def __setattr__(self, key: str, val: Any):
         self[key] = val
 
-    def load_weights(self, file: str):
+    def load_weights(
+        self,
+        file_or_weights: Union[str, List[Tuple[str, mx.array]]],
+        strict: bool = True,
+    ):
         """
-        Load and update the model's weights from a `.npz` file.
+        Update the model's weights from a ``.npz`` file or a list.
+
+        Args:
+            file_or_weights (str or list(tuple(str, mx.array))): The path to
+                the weights ``.npz`` file or a list of pairs of parameter names
+                and arrays.
+            strict (bool, optional): If ``True`` then checks that the provided
+                weights exactly match the parameters of the model. Otherwise,
+                only the weights actually contained in the model are loaded and
+                shapes are not checked. Default: ``True``.
+
+        Example:
+
+            .. code-block:: python
+
+                import mlx.core as mx
+                import mlx.nn as nn
+                model = nn.Linear(10, 10)
+
+                # Load from file
+                model.load_weights("weights.npz")
+
+                # Load from list
+                weights = [
+                    ("weight", mx.random.uniform(shape=(10, 10))),
+                    ("bias", mx.zeros((10,))),
+                ]
+                model.load_weights(weights)
+
+                # Missing weight
+                weights = [
+                    ("weight", mx.random.uniform(shape=(10, 10))),
+                ]
+
+                # Raises a ValueError exception
+                model.load_weights(weights)
+
+                # Ok, only updates the weight but not the bias
+                model.load_weights(weights, strict=False)
         """
-        self.update(tree_unflatten(list(mx.load(file).items())))
+        weights = file_or_weights
+        if isinstance(weights, str):
+            weights = list(mx.load(weights).items())
+
+        if strict:
+            new_weights = dict(weights)
+            curr_weights = dict(tree_flatten(self.parameters()))
+            if extras := (new_weights.keys() - curr_weights.keys()):
+                extras = " ".join(extras)
+                raise ValueError(f"Received parameters not in model: {extras}.")
+            if missing := (curr_weights.keys() - new_weights.keys()):
+                missing = " ".join(missing)
+                raise ValueError(f"Missing parameters: {missing}.")
+            for k, v in curr_weights.items():
+                v_new = new_weights[k]
+                if not isinstance(v_new, mx.array):
+                    raise ValueError(
+                        "Expected mx.array but received "
+                        f"{type(v_new)} for parameter {k}"
+                    )
+                if v_new.shape != v.shape:
+                    raise ValueError(
+                        f"Expected shape {v.shape} but received "
+                        f"shape {v_new.shape} for parameter {k}"
+                    )
+
+        self.update(tree_unflatten(weights))
 
     def save_weights(self, file: str):
         """
-        Save the model's weights to a `.npz` file.
+        Save the model's weights to a ``.npz`` file.
         """
         mx.savez(file, **dict(tree_flatten(self.parameters())))
 
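For context, a minimal sketch of what the new ``strict`` check (the default) catches beyond a missing key, namely a shape mismatch; ``nn.Linear`` is just a stand-in model, and the printed message follows the f-strings above:

    import mlx.core as mx
    import mlx.nn as nn

    model = nn.Linear(10, 10)

    # Right parameter names, wrong shape for "weight".
    bad_weights = [
        ("weight", mx.zeros((5, 5))),
        ("bias", mx.zeros((10,))),
    ]

    try:
        model.load_weights(bad_weights)  # strict=True is the default
    except ValueError as e:
        # e.g. "Expected shape (10, 10) but received shape (5, 5) for parameter weight"
        print(e)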
@@ -351,23 +420,26 @@ def freeze(
         """Freeze the Module's parameters or some of them. Freezing a parameter means not
         computing gradients for it.
 
-        This function is idempotent ie freezing a frozen model is a noop.
+        This function is idempotent i.e. freezing a frozen model is a no-op.
+
+        Example:
+            For instance to only train the attention parameters from a Transformer:
 
-        For instance to only train the attention parameters from a transformer:
+            .. code-block:: python
 
-            model = ...
-            model.freeze()
-            model.apply_to_modules(lambda k, v: v.unfreeze() if k.endswith("attention") else None)
+                model = nn.Transformer()
+                model.freeze()
+                model.apply_to_modules(lambda k, v: v.unfreeze() if k.endswith("attention") else None)
 
         Args:
             recurse (bool, optional): If True then freeze the parameters of the
-                submodules as well (default: True).
+                submodules as well. Default: ``True``.
             keys (str or list[str], optional): If provided then only these
                 parameters will be frozen otherwise all the parameters of a
                 module. For instance freeze all biases by calling
                 ``module.freeze(keys="bias")``.
-            strict (bool, optional): If set to True validate that the passed keys exist
-                (default: False).
+            strict (bool, optional): If set to ``True`` validate that the passed keys exist.
+                Default: ``False``.
         """
 
         def _freeze_impl(_, m):
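A minimal sketch of the ``strict`` flag documented above, with ``nn.Linear`` again standing in for a real model; per the docstring, a misspelled key under ``strict=True`` should raise rather than be silently ignored:

    import mlx.nn as nn

    model = nn.Linear(10, 10)
    model.freeze(keys="bias")                 # freezes only the bias
    model.freeze(keys="biases", strict=True)  # misspelled key: should raise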
@@ -401,21 +473,25 @@ def unfreeze(
         This function is idempotent ie unfreezing a model that is not frozen is
         a noop.
 
-        For instance to only train the biases one can do:
+        Example:
 
-            model = ...
-            model.freeze()
-            model.unfreeze(keys="bias")
+            For instance to only train the biases of a Transformer one can do:
+
+            .. code-block:: python
+
+                model = nn.Transformer()
+                model.freeze()
+                model.unfreeze(keys="bias")
 
         Args:
             recurse (bool, optional): If True then unfreeze the parameters of the
-                submodules as well (default: True).
+                submodules as well. Default: ``True``.
             keys (str or list[str], optional): If provided then only these
                 parameters will be unfrozen otherwise all the parameters of a
                 module. For instance unfreeze all biases by calling
                 ``module.unfreeze(keys="bias")``.
-            strict (bool, optional): If set to True validate that the passed keys exist
-                (default: False).
+            strict (bool, optional): If set to ``True`` validate that the passed keys exist.
+                Default: ``False``.
         """
 
         def _unfreeze_impl(_, m):
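Putting the pair together, a sketch of the bias-only fine-tuning recipe from the docstring, checked via ``trainable_parameters()``; the small two-layer ``nn.Sequential`` model and the printed key names are illustrative assumptions:

    import mlx.nn as nn
    from mlx.utils import tree_flatten

    model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
    model.freeze()
    model.unfreeze(keys="bias")

    # Only the bias arrays should remain trainable,
    # e.g. ['layers.0.bias', 'layers.1.bias']
    print([k for k, _ in tree_flatten(model.trainable_parameters())])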
@@ -432,10 +508,25 @@ def _unfreeze_impl(_, m):
         _unfreeze_impl("", self)
 
     def train(self, mode: bool = True):
+        """Set the model in or out of training mode.
+
+        Training mode only applies to certain layers. For example
+        :obj:`Dropout` applies a random mask in training mode, but is the
+        identity in evaluation mode.
+
+        Args:
+            mode (bool): Indicate if the model should be in training or
+                evaluation mode. Default: ``True``.
+        """
+
         def _set_train(_, m):
             m._training = mode
 
         self.apply_to_modules(_set_train)
 
     def eval(self):
+        """Set the model to evaluation mode.
+
+        See :func:`train`.
+        """
         self.train(False)
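Finally, a small sketch of the documented train/eval behavior using :obj:`Dropout` (the layer the new docstring cites); the exact printed values vary with the random mask:

    import mlx.core as mx
    import mlx.nn as nn

    layer = nn.Dropout(p=0.5)
    x = mx.ones((4,))

    layer.train()          # training mode (also the default)
    print(layer.training)  # True
    print(layer(x))        # some entries zeroed, the rest scaled by 1 / (1 - p)

    layer.eval()           # evaluation mode: dropout is the identity
    print(layer(x))        # [1, 1, 1, 1]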