Skip to content

Commit f638e7f

Browse files
authored
Allow initial guesses to be passed to LSMR (#251)
* Make x0 explicit in LSMR * Make damp and x0 non-differentiable to postpone gradient derivations * Assert the LSMR starting vector is used correctly * Avoid linalg.lstsq-equivalence test for wide matrices * Improve formatting * Run equivalence test in double precision because scipy uses it
1 parent 1ace202 commit f638e7f

3 files changed

Lines changed: 111 additions & 54 deletions

File tree

matfree/backend/testing.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,9 @@ def warns(warning, /):
2424
return pytest.warns(warning)
2525

2626

27+
def filterwarnings(warning, /):
28+
return pytest.mark.filterwarnings(warning)
29+
30+
2731
def case():
2832
return pytest_cases.case()

matfree/lstsq.py

Lines changed: 55 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ def lsmr(
2323
maxiter: int = 1_000_000,
2424
while_loop: Callable = control_flow.while_loop,
2525
custom_vjp: bool = True,
26-
damp: float = 0.0,
2726
):
2827
"""Construct an experimental implementation of LSMR.
2928
@@ -78,28 +77,48 @@ class State:
7877
# more often than not, the matvec is defined after the LSMR
7978
# solver has been constructed. So it's part of the run()
8079
# function, not the LSMR constructor.
81-
def run(vecmat, b, *vecmat_args):
80+
def run(vecmat, b, *vecmat_args, x0=None, damp=0.0):
81+
x_like = func.eval_shape(vecmat, b, *vecmat_args)
82+
(ncols,) = x_like.shape
83+
x = x0 if x0 is not None else np.zeros(ncols, dtype=b.dtype)
84+
85+
# Combine the lstsq_fun with a closure convert, because
86+
# typically, vecmat is a lambda function and if we want to
87+
# have explicit parameter-VJPs, all parameters need to be explicit.
88+
# This means that in this function here, we always use lstsq_public
89+
# (and return lstsq_public!), but provide lstsq_fun with the custom VJP.
90+
# Thereby, the function that gets the custom VJP is, from now on, only
91+
# called after a previous call to closure convert which 'fixes' all namespaces.
92+
vecmat_closure, args = func.closure_convert(
93+
lambda s: vecmat(s, *vecmat_args), b
94+
)
95+
return _run(vecmat_closure, b, args, x, damp)
96+
97+
def _run(vecmat, b, vecmat_args, x0, damp):
8298
def vecmat_noargs(v):
8399
return vecmat(v, *vecmat_args)
84100

85-
(ncols,) = func.eval_shape(vecmat, b, *vecmat_args).shape
101+
def matvec_noargs(w):
102+
matvec = func.linear_transpose(vecmat_noargs, b)
103+
(Aw,) = matvec(w)
104+
return Aw
86105

87-
state, normb, matvec_noargs = init(vecmat_noargs, b, ncols=ncols)
88-
step_fun = make_step(matvec_noargs, normb=normb)
106+
state, normb = init(matvec_noargs, b, x0)
107+
step_fun = make_step(matvec_noargs, normb=normb, damp=damp)
89108
cond_fun = make_cond_fun()
90109
state = while_loop(cond_fun, step_fun, state)
91110
stats_ = stats(state)
92111
return state.x, stats_
93112

94-
def init(vecmat, b, ncols: int):
113+
def init(matvec_noargs, b, x):
95114
normb = linalg.vector_norm(b)
96-
x = np.zeros(ncols, dtype=b.dtype)
97-
beta = normb
98115

99-
u = b
116+
Ax, vecmat_noargs = func.vjp(matvec_noargs, x)
117+
u = b - Ax
118+
beta = linalg.vector_norm(u)
100119
u = u / np.where(beta > 0, beta, 1.0)
101120

102-
v, matvec = func.vjp(vecmat, u)
121+
(v,) = vecmat_noargs(u)
103122
alpha = linalg.vector_norm(v)
104123
v = v / np.where(alpha > 0, alpha, 1)
105124
v = np.where(beta == 0, np.zeros_like(v), v)
@@ -115,7 +134,7 @@ def init(vecmat, b, ncols: int):
115134
sbar = 0.0
116135

117136
h = v
118-
hbar = np.zeros(ncols, dtype=b.dtype)
137+
hbar = np.zeros_like(x)
119138

120139
# Initialize variables for estimation of ||r||.
121140

@@ -176,9 +195,9 @@ def init(vecmat, b, ncols: int):
176195
istop=0,
177196
)
178197
state = tree.tree_map(np.asarray, state)
179-
return state, normb, lambda *a: matvec(*a)[0]
198+
return state, normb
180199

181-
def make_step(matvec, normb: float) -> Callable:
200+
def make_step(matvec, normb: float, damp: float) -> Callable:
182201
def step(state: State) -> State:
183202
# Perform the next step of the bidiagonalization
184203

@@ -338,7 +357,7 @@ def stats(state: State) -> dict:
338357
}
339358

340359
if custom_vjp:
341-
return _lstsq_custom_vjp(run)
360+
_run = _lstsq_custom_vjp(_run)
342361
return run
343362

344363

@@ -380,32 +399,23 @@ def _sym_ortho_3(a, b):
380399

381400

382401
def _lstsq_custom_vjp(lstsq_fun: Callable) -> Callable:
383-
# Combine the lstsq_fun with a closure convert, because
384-
# typically, vecmat is a lambda function and if we want to
385-
# have explicit parameter-VJPs, all parameters need to be explicit.
386-
# This means that in this function here, we always use lstsq_public
387-
# (and return lstsq_public!), but provide lstsq_fun with the custom VJP.
388-
# Thereby, the function that gets the custom VJP is, from now on, only
389-
# called after a previous call to closure convert which 'fixes' all namespaces.
390-
def lstsq_public(vecmat, rhs, *vecmat_args):
391-
vecmat_, args = func.closure_convert(lambda s: vecmat(s, *vecmat_args), rhs)
392-
return lstsq_fun(vecmat_, rhs, *args)
393-
394-
def lstsq_fwd(vecmat, rhs, *vecmat_args):
395-
x, stats = lstsq_public(vecmat, rhs, *vecmat_args)
396-
cache = {"x": x, "rhs": rhs, "vecmat_args": vecmat_args}
402+
def lstsq_fwd(vecmat, rhs, vecmat_args, x0, damp):
403+
x, stats = lstsq_fun(vecmat, rhs, vecmat_args, x0, damp)
404+
cache = {"x": x, "rhs": rhs, "vecmat_args": vecmat_args, "x0": x0, "damp": damp}
397405
return (x, stats), cache
398406

399-
def lstsq_rev(vecmat, cache, dmu_dx):
407+
def lstsq_rev(vecmat, x0, damp, cache, dmu_dx):
400408
dmu_dx, _ = dmu_dx
401409
x_like = func.eval_shape(vecmat, cache["rhs"], *cache["vecmat_args"])
402410
if cache["rhs"].size <= x_like.size:
403-
return lstsq_rev_wide(vecmat, cache, dmu_dx)
404-
return lstsq_rev_tall(vecmat, cache, dmu_dx)
411+
return lstsq_rev_wide(vecmat, x0, damp, cache, dmu_dx)
412+
return lstsq_rev_tall(vecmat, x0, damp, cache, dmu_dx)
405413

406-
def lstsq_rev_tall(vecmat, cache, dmu_dx):
414+
def lstsq_rev_tall(vecmat, x0, damp, cache, dmu_dx):
407415
x = cache["x"]
408416
rhs = cache["rhs"]
417+
x0 = cache["x0"]
418+
damp = cache["damp"]
409419
vecmat_args = cache["vecmat_args"]
410420

411421
def vecmat_noargs(z):
@@ -414,11 +424,12 @@ def vecmat_noargs(z):
414424
def matvec_noargs(z):
415425
return func.vjp(vecmat_noargs, rhs)[1](z)[0]
416426

417-
dmu_db = lstsq_public(matvec_noargs, dmu_dx)[0]
418-
p = lstsq_public(vecmat_noargs, -dmu_db)[0]
427+
x0_rev = np.zeros_like(rhs)
428+
dmu_db = lstsq_fun(matvec_noargs, dmu_dx, (), x0_rev, damp)[0]
429+
p = lstsq_fun(vecmat_noargs, -dmu_db, (), x0, damp)[0]
419430

420-
Ax_minus_b = matvec_noargs(x) - rhs
421431
Ap = matvec_noargs(p)
432+
Ax_minus_b = matvec_noargs(x) - rhs
422433

423434
@func.grad
424435
def grad_theta(theta):
@@ -427,9 +438,9 @@ def grad_theta(theta):
427438
return linalg.inner(rA, p) + linalg.inner(pAA, x)
428439

429440
dmu_dparams = grad_theta(vecmat_args)
430-
return dmu_db, *dmu_dparams
441+
return dmu_db, dmu_dparams
431442

432-
def lstsq_rev_wide(vecmat, cache, dmu_dx):
443+
def lstsq_rev_wide(vecmat, x0, damp, cache, dmu_dx):
433444
x = cache["x"]
434445
rhs = cache["rhs"]
435446
vecmat_args = cache["vecmat_args"]
@@ -441,11 +452,12 @@ def matvec_noargs(z):
441452
return func.linear_transpose(vecmat_noargs, rhs)(z)[0]
442453

443454
# Compute the Lagrange multiplier from the forward pass
444-
y = lstsq_public(matvec_noargs, x)[0]
455+
x0_rev = np.zeros_like(rhs)
456+
y = lstsq_fun(matvec_noargs, x, (), x0_rev, damp)[0]
445457

446458
# Compute the two solutions of the backward pass
447-
p = dmu_dx - lstsq_public(vecmat_noargs, matvec_noargs(dmu_dx))[0]
448-
q = lstsq_public(matvec_noargs, p - dmu_dx)[0]
459+
p = dmu_dx - lstsq_fun(vecmat_noargs, matvec_noargs(dmu_dx), (), x0, damp)[0]
460+
q = lstsq_fun(matvec_noargs, p - dmu_dx, (), x0_rev, damp)[0]
449461

450462
@func.grad
451463
def grad_theta(theta):
@@ -455,8 +467,8 @@ def grad_theta(theta):
455467

456468
grad_vecmat_args = grad_theta(vecmat_args)
457469
grad_rhs = -q
458-
return grad_rhs, *grad_vecmat_args
470+
return grad_rhs, grad_vecmat_args
459471

460-
lstsq_fun = func.custom_vjp(lstsq_fun, nondiff_argnums=(0,))
472+
lstsq_fun = func.custom_vjp(lstsq_fun, nondiff_argnums=(0, 3, 4))
461473
lstsq_fun.defvjp(lstsq_fwd, lstsq_rev) # type: ignore
462-
return lstsq_public
474+
return lstsq_fun

tests/test_lstsq.py

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,7 @@
11
"""Tests for least-squares functionality."""
22

33
from matfree import lstsq, test_util
4-
from matfree.backend import func, linalg, prng, testing
5-
from matfree.backend.typing import Callable
6-
7-
8-
@testing.case()
9-
def case_lstsq_lsmr() -> Callable:
10-
return lstsq.lsmr(atol=1e-5, btol=1e-5, ctol=1e-5)
4+
from matfree.backend import config, func, linalg, np, prng, testing
115

126

137
def case_A_shape_wide() -> tuple:
@@ -22,9 +16,9 @@ def case_A_shape_square() -> tuple:
2216
return 3, 3
2317

2418

25-
@testing.parametrize_with_cases("lstsq_fun", cases=".", prefix="case_lstsq_")
2619
@testing.parametrize_with_cases("A_shape", cases=".", prefix="case_A_shape_")
27-
def test_value_and_grad_matches_numpy_lstsq(lstsq_fun: Callable, A_shape: tuple):
20+
@testing.parametrize("provide_x0", [True, False])
21+
def test_value_and_grad_matches_numpy_lstsq(A_shape: tuple, provide_x0: bool):
2822
key = prng.prng_key(1)
2923

3024
key, subkey = prng.split(key, 2)
@@ -34,6 +28,13 @@ def test_value_and_grad_matches_numpy_lstsq(lstsq_fun: Callable, A_shape: tuple)
3428
key, subkey = prng.split(key, num=2)
3529
dsol = prng.normal(subkey, shape=(A_shape[1],))
3630

31+
# If the matrix is wide, any nonzero initial guess affects the optimal solution
32+
# so the comparison to np.linalg.lstsq() is no longer valid. Thus, the caveat below.
33+
key, subkey = prng.split(key, num=2)
34+
is_wide = A_shape[1] > A_shape[0]
35+
x0_suggestion = prng.normal(subkey, shape=(A_shape[1],))
36+
x0 = x0_suggestion if provide_x0 and not is_wide else None
37+
3738
def lstsq_jnp(a, b):
3839
sol, *_ = linalg.lstsq(a, b)
3940
return sol
@@ -46,12 +47,52 @@ def vecmat(vector, p_as_list):
4647
return p.T @ vector
4748

4849
def lstsq_matfree(a, b):
49-
sol, _ = lstsq_fun(vecmat, a, b)
50+
lsmr = lstsq.lsmr(atol=1e-5, btol=1e-5, ctol=1e-5)
51+
sol, _ = lsmr(vecmat, a, b, x0=x0)
5052
return sol
5153

5254
received, received_vjp = func.vjp(lstsq_matfree, rhs, [matrix])
5355
drhs2, [dmatrix2] = received_vjp(dsol) # mind the order of rhs & matrix
5456

5557
test_util.assert_allclose(received, expected)
56-
test_util.assert_allclose(dmatrix1, dmatrix2)
5758
test_util.assert_allclose(drhs1, drhs2)
59+
test_util.assert_allclose(dmatrix1, dmatrix2)
60+
61+
62+
@testing.parametrize_with_cases("A_shape", cases=".", prefix="case_A_shape_")
63+
@testing.filterwarnings("ignore: overflow encountered in") # SciPy LSMR warns...
64+
def test_output_matches_original_scipy_lsmr(A_shape: tuple):
65+
"""Assert that the implementation of scipy's LSMR is matched exactly."""
66+
import numpy as onp # noqa: ICN001
67+
import scipy.sparse.linalg
68+
69+
# Scipy uses double precision, so we emulate this behaviour
70+
config.update("jax_enable_x64", True)
71+
72+
key = prng.prng_key(1)
73+
key, subkey = prng.split(key, 2)
74+
matrix = prng.normal(subkey, shape=A_shape)
75+
key, subkey = prng.split(key, 2)
76+
rhs = prng.normal(subkey, shape=(A_shape[0],))
77+
key, subkey = prng.split(key, num=2)
78+
x0 = prng.normal(subkey, shape=(A_shape[1],))
79+
key, subkey = prng.split(key, num=2)
80+
damp = (prng.uniform(subkey, shape=())) ** 2
81+
82+
# Our code
83+
lsmr = lstsq.lsmr(atol=1e-5, btol=1e-5, ctol=1e-5)
84+
sol, _ = lsmr(lambda v: matrix.T @ v, rhs, damp=damp, x0=x0)
85+
86+
# Original NumPy
87+
matrix = onp.asarray(matrix)
88+
rhs = onp.asarray(rhs)
89+
x0 = onp.asarray(x0)
90+
damp = onp.asarray(damp)
91+
sol2, *_ = scipy.sparse.linalg.lsmr(
92+
matrix, rhs, atol=1e-5, btol=1e-5, conlim=1e5, damp=damp, x0=x0
93+
)
94+
95+
assert np.allclose(sol, np.asarray(sol2))
96+
97+
# Scipy uses double precision, so we emulate this behaviour
98+
config.update("jax_enable_x64", False)

0 commit comments

Comments
 (0)