Skip to content

Commit 4f06883

Browse files
authored
Linearisations return conditionals (#830)
* Linearisations return conditionals
* Fix the tests
1 parent d83f097 commit 4f06883

5 files changed

Lines changed: 92 additions & 68 deletions

File tree

probdiffeq/backend/tree_util.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,7 @@ def tree_all(tree, /):
2626

2727
def ravel_pytree(tree, /):
2828
return jax.flatten_util.ravel_pytree(tree)
29+
30+
31+
def register_dataclass(datacls):
32+
return jax.tree_util.register_dataclass(datacls)

probdiffeq/impl/_conditional.py

Lines changed: 66 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,18 @@
99
tree_util,
1010
)
1111
from probdiffeq.backend import numpy as np
12-
from probdiffeq.backend.typing import Any, Array
13-
from probdiffeq.impl import _normal
12+
from probdiffeq.backend.typing import Array
13+
from probdiffeq.impl import _normal, _stats
1414
from probdiffeq.util import cholesky_util
1515

1616

17-
class Conditional(containers.NamedTuple):
17+
@tree_util.register_dataclass
18+
@containers.dataclass
19+
class Conditional:
1820
"""Conditional distributions."""
1921

20-
matmul: Array
21-
noise: Any # Usually a random-variable type
22+
A: Array
23+
noise: _normal.Normal
2224

2325

2426
class ConditionalBackend(abc.ABC):
@@ -57,6 +59,10 @@ def preconditioner_apply(self, cond, p, p_inv, /):
5759
def to_derivative(self, i, standard_deviation):
5860
raise NotImplementedError
5961

62+
@abc.abstractmethod
63+
def rescale_noise(self, cond, scale):
64+
raise NotImplementedError
65+
6066

6167
class DenseConditional(ConditionalBackend):
6268
def __init__(self, ode_shape, num_derivatives, unravel, flat_shape):
@@ -65,19 +71,17 @@ def __init__(self, ode_shape, num_derivatives, unravel, flat_shape):
6571
self.unravel = unravel
6672
self.flat_shape = flat_shape
6773

68-
def apply(self, x, conditional, /):
69-
matrix, noise = conditional
70-
return _normal.Normal(matrix @ x + noise.mean, noise.cholesky)
74+
def apply(self, x, cond, /):
75+
return _normal.Normal(cond.A @ x + cond.noise.mean, cond.noise.cholesky)
7176

72-
def marginalise(self, rv, conditional, /):
73-
matmul, noise = conditional
74-
R_stack = ((matmul @ rv.cholesky).T, noise.cholesky.T)
77+
def marginalise(self, rv, cond, /):
78+
R_stack = ((cond.A @ rv.cholesky).T, cond.noise.cholesky.T)
7579
cholesky_new = cholesky_util.sum_of_sqrtm_factors(R_stack=R_stack).T
76-
return _normal.Normal(matmul @ rv.mean + noise.mean, cholesky_new)
80+
return _normal.Normal(cond.A @ rv.mean + cond.noise.mean, cholesky_new)
7781

7882
def merge(self, cond1, cond2, /):
79-
A, b = cond1
80-
C, d = cond2
83+
A, b = cond1.A, cond1.noise
84+
C, d = cond2.A, cond2.noise
8185

8286
g = A @ C
8387
xi = A @ d.mean + b.mean
@@ -86,8 +90,8 @@ def merge(self, cond1, cond2, /):
8690
)
8791
return Conditional(g, _normal.Normal(xi, Xi.T))
8892

89-
def revert(self, rv, conditional, /):
90-
matrix, noise = conditional
93+
def revert(self, rv, cond, /):
94+
matrix, noise = cond.A, cond.noise
9195
mean, cholesky = rv.mean, rv.cholesky
9296

9397
# QR-decomposition
@@ -133,10 +137,9 @@ def discretise(dt):
133137
return discretise
134138

135139
def preconditioner_apply(self, cond, p, p_inv, /):
136-
A, noise = cond
137140
normal = _normal.DenseNormal(ode_shape=self.ode_shape)
138-
noise = normal.preconditioner_apply(noise, p)
139-
A = p[:, None] * A * p_inv[None, :]
141+
noise = normal.preconditioner_apply(cond.noise, p)
142+
A = p[:, None] * cond.A * p_inv[None, :]
140143
return Conditional(A, noise)
141144

142145
def to_derivative(self, i, standard_deviation):
@@ -153,23 +156,30 @@ def select(a):
153156
noise = _normal.Normal(bias, standard_deviation * eye)
154157
return Conditional(linop, noise)
155158

159+
def rescale_noise(self, cond, scale):
160+
A = cond.A
161+
noise = cond.noise
162+
stats = _stats.DenseStats(ode_shape=self.ode_shape, unravel=self.unravel)
163+
noise_new = stats.rescale_cholesky(noise, scale)
164+
return Conditional(A, noise_new)
165+
156166

157167
class IsotropicConditional(ConditionalBackend):
158168
def __init__(self, *, ode_shape, num_derivatives, unravel_tree):
159169
self.ode_shape = ode_shape
160170
self.num_derivatives = num_derivatives
161171
self.unravel_tree = unravel_tree
162172

163-
def apply(self, x, conditional, /):
164-
A, noise = conditional
173+
def apply(self, x, cond, /):
174+
A, noise = cond.A, cond.noise
165175
# if the gain is qoi-to-hidden, the data is a (d,) array.
166176
# this is problematic for the isotropic model unless we explicitly broadcast.
167177
if np.ndim(x) == 1:
168178
x = x[None, :]
169179
return _normal.Normal(A @ x + noise.mean, noise.cholesky)
170180

171-
def marginalise(self, rv, conditional, /):
172-
matrix, noise = conditional
181+
def marginalise(self, rv, cond, /):
182+
matrix, noise = cond.A, cond.noise
173183

174184
mean = matrix @ rv.mean + noise.mean
175185

@@ -178,8 +188,8 @@ def marginalise(self, rv, conditional, /):
178188
return _normal.Normal(mean, cholesky)
179189

180190
def merge(self, cond1, cond2, /):
181-
A, b = cond1
182-
C, d = cond2
191+
A, b = cond1.A, cond1.noise
192+
C, d = cond2.A, cond2.noise
183193

184194
g = A @ C
185195
xi = A @ d.mean + b.mean
@@ -189,8 +199,8 @@ def merge(self, cond1, cond2, /):
189199
noise = _normal.Normal(xi, Xi)
190200
return Conditional(g, noise)
191201

192-
def revert(self, rv, conditional, /):
193-
matrix, noise = conditional
202+
def revert(self, rv, cond, /):
203+
matrix, noise = cond.A, cond.noise
194204

195205
r_ext_p, (r_bw_p, gain) = cholesky_util.revert_conditional(
196206
R_X_F=(matrix @ rv.cholesky).T, R_X=rv.cholesky.T, R_YX=noise.cholesky.T
@@ -225,7 +235,7 @@ def discretise(dt):
225235
return discretise
226236

227237
def preconditioner_apply(self, cond, p, p_inv, /):
228-
A, noise = cond
238+
A, noise = cond.A, cond.noise
229239

230240
A_new = p[:, None] * A * p_inv[None, :]
231241

@@ -245,25 +255,34 @@ def select(a):
245255

246256
return Conditional(linop, noise)
247257

258+
def rescale_noise(self, cond, scale):
259+
A = cond.A
260+
noise = cond.noise
261+
stats = _stats.IsotropicStats(
262+
ode_shape=self.ode_shape, unravel=self.unravel_tree
263+
)
264+
noise_new = stats.rescale_cholesky(noise, scale)
265+
return Conditional(A, noise_new)
266+
248267

249268
class BlockDiagConditional(ConditionalBackend):
250269
def __init__(self, *, ode_shape, num_derivatives, unravel_tree):
251270
self.ode_shape = ode_shape
252271
self.num_derivatives = num_derivatives
253272
self.unravel_tree = unravel_tree
254273

255-
def apply(self, x, conditional, /):
274+
def apply(self, x, cond, /):
256275
if np.ndim(x) == 1:
257276
x = x[..., None]
258277

259278
def apply_unbatch(m, s, n):
260279
return _normal.Normal(m @ s + n.mean, n.cholesky)
261280

262-
matrix, noise = conditional
281+
matrix, noise = cond.A, cond.noise
263282
return functools.vmap(apply_unbatch)(matrix, x, noise)
264283

265-
def marginalise(self, rv, conditional, /):
266-
matrix, noise = conditional
284+
def marginalise(self, rv, cond, /):
285+
matrix, noise = cond.A, cond.noise
267286
assert matrix.ndim == 3
268287

269288
mean = np.einsum("ijk,ik->ij", matrix, rv.mean) + noise.mean
@@ -275,8 +294,8 @@ def marginalise(self, rv, conditional, /):
275294
return _normal.Normal(mean, _transpose(cholesky))
276295

277296
def merge(self, cond1, cond2, /):
278-
A, b = cond1
279-
C, d = cond2
297+
A, b = cond1.A, cond1.noise
298+
C, d = cond2.A, cond2.noise
280299

281300
g = A @ C
282301
xi = (A @ d.mean[..., None])[..., 0] + b.mean
@@ -286,8 +305,8 @@ def merge(self, cond1, cond2, /):
286305
noise = _normal.Normal(xi, Xi)
287306
return Conditional(g, noise)
288307

289-
def revert(self, rv, conditional, /):
290-
A, noise = conditional
308+
def revert(self, rv, cond, /):
309+
A, noise = cond.A, cond.noise
291310
rv_chol_upper = np.transpose(rv.cholesky, axes=(0, 2, 1))
292311
noise_chol_upper = np.transpose(noise.cholesky, axes=(0, 2, 1))
293312
A_rv_chol_upper = np.transpose(A @ rv.cholesky, axes=(0, 2, 1))
@@ -329,11 +348,10 @@ def discretise(dt):
329348
return discretise
330349

331350
def preconditioner_apply(self, cond, p, p_inv, /):
332-
A, noise = cond
333-
A_new = p[None, :, None] * A * p_inv[None, None, :]
351+
A_new = p[None, :, None] * cond.A * p_inv[None, None, :]
334352

335353
normal = _normal.BlockDiagNormal(ode_shape=self.ode_shape)
336-
noise = normal.preconditioner_apply(noise, p)
354+
noise = normal.preconditioner_apply(cond.noise, p)
337355
return Conditional(A_new, noise)
338356

339357
def to_derivative(self, i, standard_deviation):
@@ -349,6 +367,15 @@ def select(a):
349367

350368
return Conditional(linop, noise)
351369

370+
def rescale_noise(self, cond, scale):
371+
A = cond.A
372+
noise = cond.noise
373+
stats = _stats.BlockDiagStats(
374+
ode_shape=self.ode_shape, unravel=self.unravel_tree
375+
)
376+
noise_new = stats.rescale_cholesky(noise, scale)
377+
return Conditional(A, noise_new)
378+
352379

353380
def _transpose(matrix):
354381
return np.transpose(matrix, axes=(0, 2, 1))

probdiffeq/impl/_linearise.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from probdiffeq.backend import abc, functools
22
from probdiffeq.backend import numpy as np
33
from probdiffeq.backend.typing import Callable
4-
from probdiffeq.impl import _normal
4+
from probdiffeq.impl import _conditional, _normal
55
from probdiffeq.util import cholesky_util
66

77

@@ -48,7 +48,7 @@ def linearise_fun_wrapped(fun, rv):
4848
)
4949
cov_lower = damp * np.eye(len(fx))
5050
bias = _normal.Normal(-fx, cov_lower)
51-
return linop, bias
51+
return _conditional.Conditional(linop, bias)
5252

5353
return linearise_fun_wrapped
5454

@@ -73,7 +73,7 @@ def A(x):
7373
linop = _jac_materialize(lambda v, _p: A(v), inputs=mean)
7474
cov_lower = damp * np.eye(len(fx))
7575
bias = _normal.Normal(-fx, cov_lower)
76-
return linop, bias
76+
return _conditional.Conditional(linop, bias)
7777

7878
return new
7979

@@ -109,7 +109,7 @@ def A(x):
109109
stack = np.concatenate((cov_lower.T, damping.T))
110110
cov_lower = cholesky_util.triu_via_qr(stack).T
111111
bias = _normal.Normal(-mean, cov_lower)
112-
return linop, bias
112+
return _conditional.Conditional(linop, bias)
113113

114114
return new
115115

@@ -142,7 +142,7 @@ def new(fun, rv, /):
142142

143143
bias = _normal.Normal(-mean, cov_lower)
144144
linop = _jac_materialize(lambda v, _p: a1(v), inputs=rv.mean)
145-
return linop, bias
145+
return _conditional.Conditional(linop, bias)
146146

147147
return new
148148

@@ -228,7 +228,7 @@ def linearise_fun_wrapped(fun, rv):
228228
)
229229
cov_lower = damp * np.eye(1)
230230
bias = _normal.Normal(-fx, cov_lower)
231-
return linop, bias
231+
return _conditional.Conditional(linop, bias)
232232

233233
return linearise_fun_wrapped
234234

@@ -261,7 +261,7 @@ def lo(s):
261261
d, *_ = linop.shape
262262
cov_lower = damp * np.ones((d, 1, 1))
263263
bias = _normal.Normal(-fx[:, None], cov_lower)
264-
return linop, bias
264+
return _conditional.Conditional(linop, bias)
265265

266266
return linearise_fun_wrapped
267267

probdiffeq/ivpsolvers.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -234,9 +234,8 @@ def complete(self, _ssv, extra, /, output_scale):
234234
cond, (p, p_inv), rv_p = extra
235235

236236
# Extrapolate the Cholesky factor (re-extrapolate the mean for simplicity)
237-
A, noise = cond
238-
noise = self.ssm.stats.rescale_cholesky(noise, output_scale)
239-
extrapolated_p, cond_p = self.ssm.conditional.revert(rv_p, (A, noise))
237+
cond = self.ssm.conditional.rescale_noise(cond, output_scale)
238+
extrapolated_p, cond_p = self.ssm.conditional.revert(rv_p, cond)
240239
extrapolated = self.ssm.normal.preconditioner_apply(extrapolated_p, p)
241240
cond = self.ssm.conditional.preconditioner_apply(cond_p, p, p_inv)
242241

@@ -331,9 +330,8 @@ def complete(self, _ssv, extra, /, output_scale):
331330
cond, (p, p_inv), rv_p = extra
332331

333332
# Extrapolate the Cholesky factor (re-extrapolate the mean for simplicity)
334-
A, noise = cond
335-
noise = self.ssm.stats.rescale_cholesky(noise, output_scale)
336-
extrapolated_p = self.ssm.conditional.marginalise(rv_p, (A, noise))
333+
cond = self.ssm.conditional.rescale_noise(cond, output_scale)
334+
extrapolated_p = self.ssm.conditional.marginalise(rv_p, cond)
337335
extrapolated = self.ssm.normal.preconditioner_apply(extrapolated_p, p)
338336

339337
# Gather and return
@@ -397,9 +395,8 @@ def complete(self, _rv, extra, /, output_scale):
397395
cond, (p, p_inv), rv_p, bw0 = extra
398396

399397
# Extrapolate the Cholesky factor (re-extrapolate the mean for simplicity)
400-
A, noise = cond
401-
noise = self.ssm.stats.rescale_cholesky(noise, output_scale)
402-
extrapolated_p, cond_p = self.ssm.conditional.revert(rv_p, (A, noise))
398+
cond = self.ssm.conditional.rescale_noise(cond, output_scale)
399+
extrapolated_p, cond_p = self.ssm.conditional.revert(rv_p, cond)
403400
extrapolated = self.ssm.normal.preconditioner_apply(extrapolated_p, p)
404401
cond = self.ssm.conditional.preconditioner_apply(cond_p, p, p_inv)
405402

@@ -513,8 +510,8 @@ def init(self, x, /):
513510
def estimate_error(self, rv, /, t):
514511
"""Perform all elements of the correction until the error estimate."""
515512
f_wrapped = self._parametrize_vector_field(t=t)
516-
A, b = self.linearize(f_wrapped, rv)
517-
observed = self.ssm.conditional.marginalise(rv, (A, b))
513+
cond = self.linearize(f_wrapped, rv)
514+
observed = self.ssm.conditional.marginalise(rv, cond)
518515

519516
# TODO: the functions involved in error estimation are still a bit patchy.
520517
# for instance, they assume that they are called
@@ -525,7 +522,7 @@ def estimate_error(self, rv, /, t):
525522
stdev = self.ssm.stats.standard_deviation(observed)
526523
error_estimate_unscaled = np.squeeze(stdev)
527524
error_estimate = output_scale * error_estimate_unscaled
528-
return error_estimate, observed, (A, b, f_wrapped)
525+
return error_estimate, observed, (cond, f_wrapped)
529526

530527
def _parametrize_vector_field(self, *, t):
531528
if self.can_handle_higher_order:
@@ -539,10 +536,11 @@ def f_wrapped(s):
539536

540537
def complete(self, rv, cache, /):
541538
"""Complete what has been left out by `estimate_error`."""
542-
A, b, f_wrapped = cache
539+
cond, f_wrapped = cache
543540
if self.use_re_linearize:
544-
A, b = self.linearize(f_wrapped, rv)
545-
observed, (_gain, corrected) = self.ssm.conditional.revert(rv, (A, b))
541+
cond = self.linearize(f_wrapped, rv)
542+
observed, reverted = self.ssm.conditional.revert(rv, cond)
543+
corrected = reverted.noise
546544
return corrected, observed
547545

548546

0 commit comments

Comments (0)