Add gradient clipping

parsiad · parsiad · commit dcd9bd569c3b · 2026-02-18T23:15:07.000-08:00
diff --git a/README.md b/README.md
@@ -52,5 +52,5 @@ sys.path.insert(0, os.path.expanduser("~/micrograd-pp/python"))
   * ☒ Stochastic Gradient Descent (SGD)
 * **Training**
   * ☐ Exponential moving average (EMA) of model weights
-  * ☐ Gradient clipping
+  * ☒ Gradient clipping
   * ☐ Learning rate schedules
diff --git a/src/micrograd_pp/__init__.py b/src/micrograd_pp/__init__.py
@@ -1,5 +1,6 @@
 from ._expr import Constant, Expr, Parameter, is_grad_enabled, maximum, no_grad, relu, zero_grads
 from ._func import cat, cross_entropy_loss, softmax
+from ._clip import clip_grad_norm_, clip_grad_value_
 from ._nn import (
     BatchNorm1d,
     Dropout,
@@ -34,6 +35,8 @@
     "Sequential",
     "SGD",
     "cat",
+    "clip_grad_norm_",
+    "clip_grad_value_",
     "cross_entropy_loss",
     "datasets",
     "eval",
diff --git a/src/micrograd_pp/_clip.py b/src/micrograd_pp/_clip.py
@@ -0,0 +1,84 @@
+import math
+from collections.abc import Iterable
+
+import numpy.typing as npt
+
+from ._expr import Expr
+from ._numpy import numpy as np
+
+
+def _get_grads(params: Iterable[Expr]) -> Iterable[npt.NDArray]:
+    return (expr.grad for expr in params if expr.requires_grad)  # lazy; skips exprs whose requires_grad is falsy
+
+
+def clip_grad_value_(params: Iterable[Expr], clip_value: float) -> None:
+    """Clamp every gradient entry to ``[-clip_value, clip_value]`` in-place.
+
+    Parameters
+    ----------
+    params
+        Parameters whose gradients are clamped; those not requiring a gradient are skipped
+    clip_value
+        Non-negative bound on the magnitude of each gradient entry (``ValueError`` if negative)
+    """
+    if clip_value < 0.0:
+        msg = "clip_value must be non-negative"
+        raise ValueError(msg)
+    for g in _get_grads(params):
+        np.clip(g, -clip_value, clip_value, out=g)
+
+
+def clip_grad_norm_(
+    params: Iterable[Expr],
+    max_norm: float,
+    norm_type: float = 2.0,
+    error_if_nonfinite: bool = False,
+    eps: float = 1e-6,
+) -> float:
+    """Clip the joint p-norm of the gradients in-place by scaling them with a common factor.
+
+    Parameters
+    ----------
+    params
+        Parameters whose gradients should be clipped; those not requiring a gradient are ignored
+    max_norm
+        Maximum allowed norm
+    norm_type
+        Type of p-norm to use. Supports ``math.inf`` for infinity norm.
+    error_if_nonfinite
+        If True, raises ``RuntimeError`` if the total norm is NaN or infinite
+    eps
+        Numerical stability term added to denominator
+
+    Returns
+    -------
+    Total norm of all gradients, measured before clipping (0.0 if there are no gradients)
+    """
+    if max_norm < 0.0:
+        msg = "max_norm must be non-negative"
+        raise ValueError(msg)
+    if eps <= 0.0:
+        msg = "eps must be positive"
+        raise ValueError(msg)
+    if norm_type <= 0.0:
+        msg = "norm_type must be positive"
+        raise ValueError(msg)
+    grads = list(_get_grads(params))
+    if not grads:
+        return 0.0
+
+    if math.isinf(norm_type):
+        # Builtin max() keeps or drops NaN depending on argument order, so NaN is propagated explicitly
+        norms = [float(np.abs(grad).max()) for grad in grads if grad.size > 0]
+        total_norm = math.nan if any(map(math.isnan, norms)) else max(norms, default=0.0)
+    else:
+        total_norm = sum(float((np.abs(grad) ** norm_type).sum()) for grad in grads) ** (1.0 / norm_type)
+    if error_if_nonfinite and not math.isfinite(total_norm):
+        msg = f"The total norm of gradients is non-finite: {total_norm}"
+        raise RuntimeError(msg)
+
+    clip_coef = max_norm / (total_norm + eps)
+    if clip_coef < 1.0:
+        for grad in grads:
+            grad *= clip_coef
+    return total_norm
diff --git a/tests/test_clip.py b/tests/test_clip.py
@@ -0,0 +1,60 @@
+import pytest
+
+import micrograd_pp as mpp
+
+np = mpp.numpy
+
+
+@pytest.fixture(autouse=True)
+def run_before_and_after_tests():
+    np.random.seed(0)  # fixed seed before every test (autouse) so any random draws are reproducible
+    yield  # the test body runs here; nothing is done afterwards
+
+
+def _set_grad(param: mpp.Expr, grad: np.ndarray) -> None:
+    param.zero_grad()  # presumably resets any previously stored gradient — confirm against Expr
+    param.update_grad(lambda: grad)  # update_grad is handed a callable that returns the desired gradient
+
+
+def test_clip_grad_value_clamps_each_element() -> None:
+    """Entries outside ``[-0.5, 0.5]`` are clamped to the bound; entries inside are untouched."""
+    weight = mpp.Parameter(np.zeros(3))
+    _set_grad(weight, np.array([-2.0, 0.25, 3.0]))
+    mpp.clip_grad_value_([weight], clip_value=0.5)
+
+    np.testing.assert_allclose(weight.grad, np.array([-0.5, 0.25, 0.5]))
+
+
+def test_clip_grad_norm_scales_all_grads_by_common_factor() -> None:
+    """Gradients with joint 2-norm 13 are all multiplied by ``6.5 / (13 + eps)`` with the default eps."""
+    first, second = mpp.Parameter(np.zeros(2)), mpp.Parameter(np.zeros(1))
+    _set_grad(first, np.array([3.0, 4.0]))
+    _set_grad(second, np.array([12.0]))
+
+    norm_before = mpp.clip_grad_norm_([first, second], max_norm=6.5, norm_type=2.0)
+
+    factor = 6.5 / (13.0 + 1e-6)
+    np.testing.assert_allclose(norm_before, 13.0)
+    np.testing.assert_allclose(first.grad, factor * np.array([3.0, 4.0]), atol=1e-12, rtol=0.0)
+    np.testing.assert_allclose(second.grad, factor * np.array([12.0]), atol=1e-12, rtol=0.0)
+
+
+def test_clip_grad_norm_noop_when_within_threshold() -> None:
+    """With joint 2-norm 13 below ``max_norm=13.1``, the gradients must be left as they are."""
+    first, second = mpp.Parameter(np.zeros(2)), mpp.Parameter(np.zeros(1))
+    _set_grad(first, np.array([3.0, 4.0]))
+    _set_grad(second, np.array([12.0]))
+
+    norm_before = mpp.clip_grad_norm_([first, second], max_norm=13.1, norm_type=2.0)
+
+    np.testing.assert_allclose(norm_before, 13.0)
+    np.testing.assert_allclose(first.grad, np.array([3.0, 4.0]), atol=1e-12, rtol=0.0)
+    np.testing.assert_allclose(second.grad, np.array([12.0]), atol=1e-12, rtol=0.0)
+
+
+def test_clip_grad_norm_errors_on_nonfinite_if_requested() -> None:
+    """An infinite gradient makes the total norm non-finite, which must raise when requested."""
+    bad = mpp.Parameter(np.zeros(1))
+    _set_grad(bad, np.array([np.inf]))
+    with pytest.raises(RuntimeError):
+        mpp.clip_grad_norm_([bad], max_norm=1.0, error_if_nonfinite=True)
diff --git a/tests/test_mnist.py b/tests/test_mnist.py
@@ -59,7 +59,11 @@ def test_mnist(batch_sz: int = 64, n_epochs: int = 3):
             x = mpp.Constant(train_images[batch_index])
             y = train_labels[batch_index]
             loss = cross_entropy_loss(model(x), y)
-            loss.backward(opt=opt)
+            params = loss.params
+            mpp.zero_grads(params)
+            loss.backward()
+            mpp.clip_grad_norm_(params, max_norm=5.0)
+            opt.step(params)
         test_x = mpp.Constant(test_images)
         with mpp.eval(), mpp.no_grad():
             test_fx = model(test_x)
diff --git a/tests/test_opt.py b/tests/test_opt.py
@@ -12,17 +12,13 @@ def run_before_and_after_tests():
 
 
 @pytest.mark.parametrize(
-    ("opt_factory", "num_steps", "atol", "pass_opt_to_backward"),
-    [
-        (*cfg, pass_opt_to_backward)
-        for cfg in (
-            (lambda: mpp.SGD(lr=0.1), 150, 1e-8),
-            (lambda: mpp.AdamW(lr=0.2, weight_decay=0.0), 600, 1e-8),
-        )
-        for pass_opt_to_backward in (False, True)
-    ],
+    ("opt_factory", "num_steps", "atol"),
+    (
+        (lambda: mpp.SGD(lr=0.1), 150, 1e-8),
+        (lambda: mpp.AdamW(lr=0.2, weight_decay=0.0), 600, 1e-8),
+    ),
 )
-def test_mse(opt_factory, num_steps: int, atol: float, pass_opt_to_backward: bool):
+def test_mse(opt_factory, num_steps: int, atol: float):
     n = 10
     coef = np.random.randn(3, 1)
     coef_hat = np.random.randn(3, 1)
@@ -38,11 +34,6 @@ def test_mse(opt_factory, num_steps: int, atol: float, pass_opt_to_backward: boo
     for _ in range(num_steps):
         y_pred_ = x_ @ coef_hat_
         mse = ((y_pred_ - y_) ** 2).sum() / n
-        if pass_opt_to_backward:
-            mse.backward(opt=opt)  # Automatically handles zeroing gradients and updating the optimizer state
-        else:
-            mpp.zero_grads(mse.params)
-            mse.backward()
-            opt.step(mse.params)
+        mse.backward(opt=opt)  # Automatically handles zeroing gradients and updating the optimizer state
 
     np.testing.assert_allclose(coef, coef_hat, rtol=0.0, atol=atol)