@@ -12,7 +12,7 @@
 import mlx.nn as nn
 import numpy as np
 from mlx.nn.utils import average_gradients
-from mlx.utils import tree_flatten
+from mlx.utils import tree_flatten, tree_map
 from transformers import PreTrainedTokenizer
 
 from .datasets import CacheDataset
@@ -80,11 +80,11 @@ class TrainingArgs:
     )
 
 
-def default_loss(model, batch, lengths, cache):
+def default_loss(model, batch, lengths, cache=None):
     inputs = batch[:, :-1]
     targets = batch[:, 1:]
 
-    offset = cache[0].offset
+    offset = cache[0].offset if cache is not None else 0
     logits = model(inputs, cache=cache)
     logits = logits.astype(mx.float32)
 
@@ -184,6 +184,7 @@ def evaluate(
 
     seq_step_size = seq_step_size or max_seq_length
 
+    cache = make_prompt_cache(model)
    for _, batch in zip(
        index_iterator,
        iterate_batches(
@@ -193,13 +194,14 @@ def evaluate(
            max_seq_length=max_seq_length,
        ),
    ):
-        cache = make_prompt_cache(model)
        seq_length = batch[0].shape[1]
        for s in range(0, seq_length, seq_step_size):
            local_batch = (batch[0][:, s : s + seq_step_size], batch[1])
            losses, toks = loss(model, *local_batch, cache)
            all_losses += losses * toks
            ntokens += toks
+            if s + seq_step_size >= seq_length:
+                reset_prompt_cache(cache)
            mx.eval(all_losses, ntokens)
 
    all_losses = mx.distributed.all_sum(all_losses, stream=mx.cpu)
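In the evaluation loop above, each chunk contributes `losses * toks` to `all_losses` and `toks` to `ntokens`, so dividing the two afterwards (the normalization itself is outside this hunk) yields a token-weighted mean over the full sequence. A minimal sketch of that weighting, with made-up per-chunk numbers:

import numpy as np

# Hypothetical per-chunk mean losses and token counts for one long sequence.
chunk_losses = np.array([2.31, 2.07, 1.95])
chunk_tokens = np.array([512, 512, 276])

all_losses = float(np.sum(chunk_losses * chunk_tokens))  # mirrors `all_losses += losses * toks`
ntokens = int(np.sum(chunk_tokens))                       # mirrors `ntokens += toks`
print(all_losses / ntokens)  # token-weighted mean loss over the full sequence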
@@ -241,13 +243,14 @@ def train(
    if args.grad_checkpoint:
        grad_checkpoint(model.layers[0])
 
+    seq_step_size = args.seq_step_size or args.max_seq_length
    cache = make_prompt_cache(model)
    state = [model.state, optimizer.state, mx.random.state]
 
    @partial(mx.compile, inputs=state, outputs=state)
    def step(batch):
        # Forward and backward pass
-        (lvalue, toks), grad = loss_value_and_grad(model, *batch, cache)
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
 
        # All reduce the gradients if running in distributed mode
        grad = average_gradients(grad)
@@ -264,6 +267,38 @@ def step(batch):
 
    model.train()
    seq_step_size = args.seq_step_size or args.max_seq_length
+    def seq_split_step(batch):
+        losses = mx.array(0.0)
+        n_tokens = mx.array(0.0)
+        seq_length = batch[0].shape[1]
+        grad_accum = None
+        for s in range(0, seq_length, seq_step_size):
+            local_batch = (batch[0][:, s : s + seq_step_size], batch[1])
+            (lvalue, toks), grad = loss_value_and_grad(model, *local_batch, cache)
+            prev_n_tokens = n_tokens
+            losses += toks * lvalue
+            n_tokens += toks
+
+            if grad_accum is None:
+                grad_accum = grad
+            else:
+                scale_g = toks / n_tokens
+                scale_acc = prev_n_tokens / n_tokens
+                grad_accum = tree_map(lambda g, acc: scale_g * g + scale_acc * acc, grad, grad_accum)
+
+
+            # Let go of the prompt cache before the last eval
+            if s + seq_step_size >= seq_length:
+                reset_prompt_cache(cache)
+            mx.eval(grad_accum, losses, n_tokens)
+
+        grad_accum = average_gradients(grad_accum)
+        optimizer.update(model, grad_accum)
+        return losses / n_tokens, n_tokens
+
+    loss_value_and_grad = nn.value_and_grad(model, loss)
+
+>>>>>>> 568a8d6 (use gradient accumulation)
    losses = 0
    n_tokens = 0
    steps = 0
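The `grad_accum` update in `seq_split_step` maintains a running token-weighted average: the incoming gradient is scaled by `toks / n_tokens` and the previous accumulator by `prev_n_tokens / n_tokens`. A minimal NumPy check of that identity (illustrative only, with hypothetical chunk token counts and stand-in gradient vectors):

import numpy as np

toks_per_chunk = [512, 512, 37]                        # hypothetical token counts per chunk
grads = [np.random.randn(4) for _ in toks_per_chunk]   # stand-ins for per-chunk gradients

grad_accum, n_tokens = None, 0.0
for t, g in zip(toks_per_chunk, grads):
    prev_n_tokens = n_tokens
    n_tokens += t
    if grad_accum is None:
        grad_accum = g
    else:
        # Same rescaling as the tree_map update in the diff above.
        grad_accum = (t / n_tokens) * g + (prev_n_tokens / n_tokens) * grad_accum

# Equivalent closed form: a token-weighted mean of the per-chunk gradients.
expected = sum(t * g for t, g in zip(toks_per_chunk, grads)) / sum(toks_per_chunk)
assert np.allclose(grad_accum, expected)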
@@ -317,15 +352,16 @@ def step(batch):
 
        tic = time.perf_counter()
 
-        seq_length = batch[0].shape[1]
-        for s in range(0, seq_length, seq_step_size):
-            local_batch = (batch[0][:, s : s + seq_step_size], batch[1])
-            lvalue, toks = step(local_batch)
-            losses += lvalue
-            n_tokens += toks
-            steps += 1
-            mx.eval(state, losses, n_tokens)
-        reset_prompt_cache(cache)
+        if batch[0].shape[1] > seq_step_size:
+            lvalue, toks = seq_split_step(batch)
+        else:
+            lvalue, toks = step(batch)
+
+        losses += lvalue
+        n_tokens += toks
+        steps += 1
+        mx.eval(state, losses, n_tokens)
+
        train_time += time.perf_counter() - tic
 
        # Report training loss if needed
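The dispatch above only calls `seq_split_step` when a batch's sequence length exceeds `seq_step_size`; inside it, the sequence axis is walked in fixed `seq_step_size`-wide slices, with only the final slice possibly shorter. A small illustration of that slicing, using hypothetical shapes:

import numpy as np

seq_step_size = 512
batch_tokens = np.zeros((2, 1300), dtype=np.int32)  # hypothetical (batch, sequence) token ids

# Same slicing pattern as the chunk loops in the diff above.
chunks = [
    batch_tokens[:, s : s + seq_step_size]
    for s in range(0, batch_tokens.shape[1], seq_step_size)
]
print([c.shape for c in chunks])  # [(2, 512), (2, 512), (2, 276)]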