@@ -94,7 +94,8 @@ class Kron(torch.optim.Optimizer):
         mu_dtype: Dtype of the momentum accumulator.
         precond_dtype: Dtype of the preconditioner.
         decoupled_decay: AdamW style decoupled weight decay
-        flatten_dim: Flatten dim >= 2 instead of relying on expressions
+        flatten: Flatten dimensions instead of fully relying on expressions for higher rank params
+        flatten_start_end: Range of dimensions to flatten, defaults to (2, -1).
         deterministic: Deterministic behaviour across save / load (resume). FIXME slow, needs work
     """

@@ -114,7 +115,8 @@ def __init__(
             mu_dtype: Optional[torch.dtype] = None,
             precond_dtype: Optional[torch.dtype] = None,
             decoupled_decay: bool = False,
-            flatten_dim: bool = False,
+            flatten: bool = False,
+            flatten_start_end: Tuple[int, int] = (2, -1),
             deterministic: bool = False,
     ):
         if not has_opt_einsum:
@@ -141,7 +143,8 @@ def __init__(
             mu_dtype=mu_dtype,
             precond_dtype=precond_dtype,
             decoupled_decay=decoupled_decay,
-            flatten_dim=flatten_dim,
+            flatten=flatten,
+            flatten_start_end=flatten_start_end,
         )
         super(Kron, self).__init__(params, defaults)

@@ -229,8 +232,11 @@ def step(self, closure=None):

                 grad = p.grad
                 state = self.state[p]
-                if group['flatten_dim']:
-                    grad = grad.view(grad.size(0), -1)
+
+                flattened = False
+                if group['flatten']:
+                    grad = safe_flatten(grad, *group["flatten_start_end"])
+                    flattened = True

                 if len(state) == 0:
                     state["step"] = 0
@@ -341,7 +347,7 @@ def step(self, closure=None):

                 # RMS of pre_grad should be 1.0, so let's cap at 1.1
                 pre_grad.mul_(torch.clamp(1.1 / (pre_grad.square().mean().sqrt_() + 1e-8), max=1.0))
-                if group['flatten_dim']:
+                if flattened:
                     pre_grad = pre_grad.view(p.shape)

                 # Apply weight decay
@@ -361,6 +367,20 @@ def step(self, closure=None):
         return loss


+def safe_flatten(tensor, start_dim=0, end_dim=-1):
+    ndim = tensor.ndim
+
+    # Convert negative end_dim to positive and clip to end
+    end_dim = min(end_dim if end_dim >= 0 else ndim + end_dim, ndim - 1)
+
+    # If tensor has fewer dims than start_dim or start > end, return tensor as is
+    if ndim <= start_dim or start_dim > end_dim:
+        return tensor
+
+    # Now safe to flatten
+    return tensor.flatten(start_dim, end_dim)
+
+
 def _init_Q_exprs(
         t,
         scale,
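For reference, a minimal standalone sketch of how safe_flatten behaves with the default flatten_start_end=(2, -1): parameters of rank greater than 2 are flattened from dim 2 onward before preconditioning, while rank-1 and rank-2 parameters pass through unchanged. The tensor shapes below are illustrative only, not taken from the patch.

import torch


def safe_flatten(tensor, start_dim=0, end_dim=-1):
    ndim = tensor.ndim
    # Convert negative end_dim to positive and clip to the last dim
    end_dim = min(end_dim if end_dim >= 0 else ndim + end_dim, ndim - 1)
    # Degenerate ranges: return the tensor untouched
    if ndim <= start_dim or start_dim > end_dim:
        return tensor
    return tensor.flatten(start_dim, end_dim)


conv_w = torch.randn(64, 32, 3, 3)   # hypothetical rank-4 conv weight
linear_w = torch.randn(512, 768)     # rank-2 linear weight
bias = torch.randn(768)              # rank-1 bias

print(safe_flatten(conv_w, 2, -1).shape)    # torch.Size([64, 32, 9])
print(safe_flatten(linear_w, 2, -1).shape)  # unchanged: torch.Size([512, 768])
print(safe_flatten(bias, 2, -1).shape)      # unchanged: torch.Size([768])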