
Commit 9ff17b1

accuracy fix
1 parent 31cecf0 commit 9ff17b1

File tree: torchft/manager.py, train_ddp2.py

2 files changed (+33 lines, -8 lines)

torchft/manager.py

Lines changed: 19 additions & 4 deletions
@@ -34,7 +34,7 @@
 import uuid
 import weakref
 from concurrent.futures import ThreadPoolExecutor
-from contextlib import nullcontext
+from contextlib import contextmanager
 from datetime import timedelta
 from enum import Enum
 from typing import (
@@ -454,8 +454,11 @@ def allreduce(
 
         # If dirty, the result will not be committed, so return empty tensor.
         if self._dataloader_dirty:
-            work = _DummyWork(torch.zeros_like(tensor))
-            return _ManagedWork(self, work, tensor)
+            tensor.zero_()
+            return _ManagedWork(self, _DummyWork(tensor), tensor)
+
+        if not self.require_backward_grad_sync:
+            return _ManagedWork(self, _DummyWork(tensor), tensor)
 
         num_participants: int = self.num_participants()
 
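Both early returns hand the caller a handle that looks like an already-finished collective, so the surrounding training code can call .wait() unconditionally. A rough, hypothetical sketch of that idea (not torchft's actual _DummyWork, which is defined elsewhere in this file):

# Hypothetical illustration only: a no-op work handle that is already complete.
# The real _DummyWork in torchft may differ; this just shows the contract the
# early returns above rely on.
class NoOpWork:
    def __init__(self, tensor):
        self._tensor = tensor  # the (possibly zeroed) tensor passed straight through

    def wait(self, timeout=None) -> bool:
        # Nothing to synchronize, so report immediate completion.
        return True

    def result(self):
        return self._tensor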
@@ -496,7 +499,7 @@ def callback(
         ) -> torch.Tensor:
             nonlocal tensor
             if reduce_op == ReduceOp.AVG:
-                tensor /= num_participants
+                tensor /= num_participants * self._accumulation_steps
             return tensor
 
         managed_work = _ManagedWork(self, work, tensor)
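This divisor change is the heart of the "accuracy fix": with gradient accumulation, each replica's gradient is a sum over its local micro-batches, so averaging only across replicas would leave the result too large by a factor of the accumulation steps. A small numeric sketch, assuming equal-sized micro-batches and a mean-reduction loss (all names and numbers are illustrative):

# Illustrative numbers: 2 replicas, 4 accumulation steps each, and every
# micro-batch gradient equal to 1.0 for simplicity.
num_participants = 2
accumulation_steps = 4

local_grad = 1.0 * accumulation_steps            # 4.0 summed locally per replica
allreduced_sum = local_grad * num_participants   # 8.0 after the SUM allreduce

# Dividing only by num_participants would report 4.0 (off by the number of
# accumulation steps); dividing by both recovers the per-micro-batch average:
avg_grad = allreduced_sum / (num_participants * accumulation_steps)  # 1.0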
@@ -513,6 +516,15 @@ def callback(
 
         return _DummyWork(tensor)
 
+    @contextmanager
+    def no_sync(self):
+        old_require_backward_grad_sync = self.require_backward_grad_sync
+        self.require_backward_grad_sync = False
+        try:
+            yield
+        finally:
+            self.require_backward_grad_sync = old_require_backward_grad_sync
+
     def report_error(self, e: Exception) -> None:
         """
         Report an error to the manager.
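The new no_sync() mirrors the shape of DDP's no_sync: while the flag is cleared, allreduce short-circuits to a no-op handle, so backward passes only accumulate gradients locally. A minimal usage sketch, assuming manager is a torchft Manager and model, criterion, and micro_batches are placeholders (the train_ddp2.py changes below follow the same pattern):

# Skip gradient synchronization on every micro-batch except the last one.
for step, (inputs, labels) in enumerate(micro_batches):
    loss = criterion(model(inputs), labels)
    if step < len(micro_batches) - 1:
        with manager.no_sync():
            loss.backward()   # gradients accumulate locally, no allreduce
    else:
        loss.backward()       # final micro-batch triggers the usual allreduce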
@@ -931,6 +943,9 @@ def should_commit(self, timeout: Optional[timedelta] = None) -> bool:
         Raises:
             RuntimeError: if should_commit fails max_retries times in a row and max_retries is set
         """
+        # Sometimes allreduce is not called before should_commit, so wait for the quorum here.
+        self.wait_quorum()
+
         # make sure recovery is complete before committing
         with torch.profiler.record_function(
             "torchft::manager::should_commmit::recovery_stream::synchronize"

train_ddp2.py

Lines changed: 14 additions & 4 deletions
@@ -71,7 +71,7 @@ def load_model(m, optimizer, manager):
     with open(f"{CHECKPOINT_PATH}_latest", "r") as f:
         latest_checkpoint_path = f.read().strip()
     print(f"Loading checkpoint from {latest_checkpoint_path}")
-    loaded_state_dict = torch.load(latest_checkpoint_path)
+    loaded_state_dict = torch.load(latest_checkpoint_path, weights_only=True)
     m.load_state_dict(loaded_state_dict["model"])
     optimizer.load_state_dict(loaded_state_dict["optim"])
     manager.load_state_dict(loaded_state_dict["torchft"])
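weights_only=True restricts torch.load to tensors and plain containers instead of arbitrary pickled objects, which is the safer choice for checkpoint files (and the default in newer PyTorch releases). A small standalone illustration; the path and toy model are made up:

import torch
import torch.nn as nn

# Save a plain state_dict and load it back with the restricted unpickler.
# Tensors, dicts, and primitives round-trip fine; arbitrary Python objects
# would be rejected under weights_only=True.
model = nn.Linear(4, 2)
torch.save({"model": model.state_dict()}, "/tmp/example_ckpt.pt")
state = torch.load("/tmp/example_ckpt.pt", weights_only=True)
model.load_state_dict(state["model"])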
@@ -89,10 +89,12 @@ def main() -> None:
     )
 
     def load_state_dict(state_dict):
+        print("Received checkpoint!")
         m.load_state_dict(state_dict["model"])
         optimizer.load_state_dict(state_dict["optim"])
 
     def state_dict():
+        print("Setup checkpoint to send!")
         return {
             "model": m.state_dict(),
             "optim": optimizer.state_dict(),
@@ -206,21 +208,29 @@ def forward(self, x):
         ) is not None:
             optimizer.zero_grad()
             total_loss = 0.0
-            for inputs, labels in batches:
+            accumulation_steps = len(batches)
+            for i in range(accumulation_steps):
+                inputs, labels = batches[i]
                 inputs = inputs.to(device)
                 labels = labels.to(device)
                 out = m(inputs)
                 loss = criterion(out, labels)
-                loss.backward()
+                if i == accumulation_steps - 1:
+                    loss.backward()
+                else:
+                    with manager.no_sync():
+                        loss.backward()
                 total_loss += loss.item()
+
             # If errored, the optimizer step will be a no-op, and the parameter will not be updated.
             # Although it is possible to use new pg to compute old batches, it is still safe.
             if not optimizer.step():
                 continue
 
             # all reduce the loss across all replicas
-            total_loss /= len(batches)
+            total_loss = total_loss / BATCH_SIZE
             loss_tensor = torch.tensor(total_loss, device=device)
+            # manager all reduce will divide by replica world size * accumulation steps
             manager.allreduce(loss_tensor).wait()
             avg_loss = loss_tensor.item()
             if manager.participating_rank() == 0:
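To make the loss bookkeeping explicit: after the loop, total_loss on each replica is the sum of accumulation_steps micro-batch losses; the script scales it by 1 / BATCH_SIZE locally, and the manager's AVG allreduce (per the manager.py change above) divides the cross-replica sum by num_participants * accumulation_steps. A small numeric trace; the concrete numbers and the BATCH_SIZE value are illustrative only:

# Illustrative trace of the reported loss value.
num_participants = 2
accumulation_steps = 4
BATCH_SIZE = 64           # placeholder for the script's constant
micro_batch_loss = 2.0    # pretend every micro-batch loss is identical

total_loss = micro_batch_loss * accumulation_steps                    # 8.0 per replica
local_value = total_loss / BATCH_SIZE                                 # 0.125 fed into allreduce
allreduced_sum = local_value * num_participants                       # 0.25 after SUM
reported = allreduced_sum / (num_participants * accumulation_steps)   # 0.03125
# which works out to micro_batch_loss / BATCH_SIZE.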
