Merge branch 'refactor-eigendecomposition' into eigendecomposed-shampoo

runame · runame · commit 5ec66cfa12eb · 2025-03-24T18:09:30.000Z
diff --git a/matrix_functions.py b/matrix_functions.py
@@ -14,7 +14,6 @@
 from dataclasses import asdict
 from fractions import Fraction
 from math import isfinite
-from typing import NamedTuple
 
 import torch
 from matrix_functions_types import (
@@ -193,7 +192,7 @@ def matrix_eigendecomposition(
     elif A.shape[0] != A.shape[1]:
         raise ValueError("Matrix is not square!")
 
-    # return the diagonal of A and the identity matrix if A is diagonal
+    # Return the (sorted) diagonal of A and identity matrix if A is diagonal.
     if is_diagonal:
         return A.diag(), torch.eye(
             A.shape[0],
@@ -258,20 +257,13 @@ def _eigh_eigenvalue_decomposition(
     return L.to(device=current_device), Q.to(device=current_device)
 
 
-class Criterion(NamedTuple):
-    """Named tuple for result of convergence criterion."""
-
-    converged: bool
-    approximate_eigenvalues: Tensor
-
-
-def _approximate_eigenvalues_criterion_below_or_equal_tolerance(
-    A: Tensor, Q: Tensor, tolerance: float
-) -> Criterion:
-    """Evaluates if a criterion using approximate eigenvalues is below or equal to the tolerance.
+def _estimated_eigenvalues_criterion_below_or_equal_tolerance(
+    estimated_eigenvalues: Tensor, tolerance: float
+) -> bool:
+    """Evaluates if a criterion using estimated eigenvalues is below or equal to the tolerance.
 
     Let Q^T A Q =: B be the estimate of the eigenvalues of the matrix A, where Q is the matrix containing the last computed eigenvectors.
-    The approximate eigenvalues criterion is defined as ||B - diag(B)||_F <= tolerance * ||B||_F.
+    The criterion based on the estimated eigenvalues is defined as ||B - diag(B)||_F <= tolerance * ||B||_F.
     The tolerance hyperparameter should therefore be in the interval [0.0, 1.0].
 
     This convergence criterion can be motivated by considering A' = Q diag(B) Q^T as an approximation of A.
@@ -285,17 +277,13 @@ def _approximate_eigenvalues_criterion_below_or_equal_tolerance(
         tolerance (float): The tolerance for the criterion.
 
     Returns:
-        Criterion: Named tuple with the fields 'converged' and 'approximate_eigenvalues':
-            converged (bool): whether the criterion is below or equal to the tolerance.
-            approximate_eigenvalues (Tensor): the approximate eigenvalues of A as a vector.
+        bool: True if the criterion is below or equal to the tolerance, False otherwise.
 
     """
-    approximate_eigenvalues = Q.T @ A @ Q
-    norm = torch.linalg.norm(approximate_eigenvalues)
-    diagonal_norm = torch.linalg.norm(approximate_eigenvalues.diag())
+    norm = torch.linalg.norm(estimated_eigenvalues)
+    diagonal_norm = torch.linalg.norm(estimated_eigenvalues.diag())
     off_diagonal_norm = torch.sqrt(norm**2 - diagonal_norm**2)
-    converged = bool(off_diagonal_norm <= tolerance * norm)
-    return Criterion(converged, approximate_eigenvalues.diag())
+    return bool(off_diagonal_norm <= tolerance * norm)
 
 
 def _qr_algorithm(
@@ -304,13 +292,12 @@ def _qr_algorithm(
     max_iterations: int = 1,
     tolerance: float = 0.01,
 ) -> tuple[Tensor, Tensor]:
-    """
-    Approximately compute the eigendecomposition of a symmetric matrix by performing the QR algorithm.
+    """Approximately compute the eigendecomposition of a symmetric matrix by performing the QR algorithm.
 
     Given an initial estimate of the eigenvectors Q of matrix A, a power iteration and a QR decomposition is performed each iteration, i.e. Q, _ <- QR(A @ Q).
     When the initial estimate is the zero matrix, the eigendecomposition is computed using _eigh_eigenvalue_decomposition.
 
-    Note that if the approximate eigenvalues criterion is already below or equal to the tolerance given the initial eigenvectors_estimate, the QR iterations will be skipped.
+    Note that if the criterion based on the estimated eigenvalues is already below or equal to the tolerance given the initial eigenvectors_estimate, the QR iterations will be skipped.
 
     Args:
         A (Tensor): The symmetric input matrix.
@@ -320,35 +307,30 @@ def _qr_algorithm(
             (Default: 0.01)
 
     Returns:
-        tuple[Tensor, Tensor]: The approximate eigenvalues and eigenvectors of the input matrix A.
+        tuple[Tensor, Tensor]: The estimated eigenvalues and eigenvectors of the input matrix A.
 
     """
     if not eigenvectors_estimate.any():
         return _eigh_eigenvalue_decomposition(A)
 
     # Perform orthogonal/simultaneous iterations (QR algorithm).
     Q = eigenvectors_estimate
+    estimated_eigenvalues = Q.T @ A @ Q
     iteration = 0
-    # NOTE: This will skip the QR iterations if the approximate eigenvalues criterion is already below or equal to the tolerance given the initial eigenvectors_estimate.
+    # NOTE: This will skip the QR iterations if the criterion is already below or equal to the tolerance given the initial eigenvectors_estimate.
     while (
         iteration < max_iterations
-        and not (
-            criterion := _approximate_eigenvalues_criterion_below_or_equal_tolerance(
-                A, Q, tolerance
-            )
-        ).converged
+        and not _estimated_eigenvalues_criterion_below_or_equal_tolerance(
+            estimated_eigenvalues, tolerance
+        )
     ):
         power_iteration = A @ Q
         Q = torch.linalg.qr(power_iteration).Q
         iteration += 1
+        estimated_eigenvalues = Q.T @ A @ Q
 
     # Ensure consistent ordering of estimated eigenvalues and eigenvectors.
-    estimated_eigenvalues = (
-        criterion.approximate_eigenvalues
-        if iteration == 0  # Re-use approximate eigenvalues if iterations were skipped.
-        else torch.einsum("ij, ik, kj -> j", Q, A, Q)
-    )
-    estimated_eigenvalues, indices = estimated_eigenvalues.sort()
+    estimated_eigenvalues, indices = estimated_eigenvalues.diag().sort(stable=True)
     Q = Q[:, indices]
 
     return estimated_eigenvalues, Q
diff --git a/matrix_functions_types.py b/matrix_functions_types.py
@@ -53,11 +53,11 @@ def __post_init__(self) -> None:
 class QREigendecompositionConfig(EigendecompositionConfig):
     """Configuration for eigenvalue decomposition via QR algorithm.
 
-    Determines whether the QR algorithm has converged based on the approximate eigenvalues Q^T A Q =: B, where Q is the last computed eigenvectors and A is the current Kronecker factor.
-    The approximate eigenvalues update criterion is then defined as ||B - diag(B)||_F <= tolerance * ||B||_F.
+    Determines whether the QR algorithm has converged based on the estimated eigenvalues Q^T A Q =: B, where Q is the last computed eigenvectors and A is the current Kronecker factor.
+    The convergence criterion based on the estimated eigenvalues is then defined as ||B - diag(B)||_F <= tolerance * ||B||_F.
     The tolerance hyperparameter should therefore be in the interval [0.0, 1.0].
 
-    Note that if the approximate eigenvalues criterion is already below or equal to the tolerance given the initial eigenvectors_estimate, the QR iterations will be skipped.
+    Note that if the criterion based on the estimated eigenvalues is already below or equal to the tolerance given the initial eigenvectors_estimate, the QR iterations will be skipped.
 
     This convergence criterion can be motivated by considering A' = Q diag(B) Q^T as an approximation of A.
     We have ||A - A'||_F = ||A - Q diag(B) Q^T||_F = ||Q^T A Q - diag(B)||_F = ||B - diag(B)||_F.
diff --git a/tests/matrix_functions_test.py b/tests/matrix_functions_test.py
@@ -820,20 +820,20 @@ def test_matrix_eigendecomposition(self) -> None:
         ]
         expected_eigenvalues_list = [
             torch.tensor([1.0, 4.0]),
-            torch.tensor([2.9009e-03, 1.7424e-01, 1.9828e03]),
+            torch.tensor([2.9008677229e-03, 1.7424316704e-01, 1.9828229980e03]),
         ]
         expected_eigenvectors_list = [
             torch.tensor([[1.0, 0.0], [0.0, 1.0]]),
             torch.tensor(
                 [
-                    [0.0460, -0.6287, 0.7763],
-                    [-0.1752, -0.7702, -0.6133],
-                    [0.9835, -0.1078, -0.1455],
+                    [0.0460073575, -0.6286827326, 0.7762997746],
+                    [-0.1751257628, -0.7701635957, -0.6133345366],
+                    [0.9834705591, -0.1077321917, -0.1455317289],
                 ]
             ),
         ]
 
-        atol = 0.05  # TODO: Ensure consistent ordering of the eigenvectors and decrease tolerance.
+        atol = 1e-4
         rtol = 1e-5
         with self.subTest("Test with diagonal case."):
             torch.testing.assert_close(
@@ -876,12 +876,18 @@ def test_matrix_eigendecomposition(self) -> None:
                 )
 
         # Tests for `QREigendecompositionConfig`.
-        initialization_strategies = {
-            "zero": lambda A: torch.zeros_like(A),
-            "identity": lambda A: torch.eye(A.shape[0], dtype=A.dtype, device=A.device),
-            "exact": lambda A: matrix_eigendecomposition(A)[1],
+        initialization_strategies_to_functions_atol = {
+            "zero": (lambda A: torch.zeros_like(A), atol),
+            "identity": (
+                lambda A: torch.eye(A.shape[0], dtype=A.dtype, device=A.device),
+                2e-3,
+            ),
+            "exact": (lambda A: matrix_eigendecomposition(A)[1], 2e-3),
         }
-        for name, initialization_fn in initialization_strategies.items():
+        for name, (
+            initialization_fn,
+            atol,
+        ) in initialization_strategies_to_functions_atol.items():
             with self.subTest(
                 f"Test with QREigendecompositionConfig with {name} initialization."
             ):