local_sgd: initial version of fault tolerant LocalSGD (#47)

d4l3k · web-flow · commit 6d6e9a4df51b · 2024-12-18T14:52:56.000-08:00
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -17,6 +17,7 @@ the entire training job.
     manager
     optim
     ddp
+    local_sgd
     data
     checkpointing
     parameter_server
diff --git a/docs/source/local_sgd.rst b/docs/source/local_sgd.rst
@@ -0,0 +1,4 @@
+.. automodule:: torchft.local_sgd
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/torchft/ddp.py b/torchft/ddp.py
@@ -68,7 +68,7 @@ def __init__(self, manager: "Manager", module: nn.Module, **kwargs: object) -> N
     def _comm_hook(
         state: "Manager", bucket: dist.GradBucket
     ) -> torch.futures.Future[torch.Tensor]:
-        return state.allreduce_grad(bucket.buffer())
+        return state.allreduce(bucket.buffer())
 
 
 class PureDistributedDataParallel(nn.Module):
@@ -88,7 +88,7 @@ def __init__(self, manager: "Manager", module: nn.Module) -> None:
 
         def post_grad_hook(p: torch.Tensor) -> None:
             if p.grad is not None:
-                manager.allreduce_grad(p.grad)
+                manager.allreduce(p.grad)
 
         for p in module.parameters():
             p.register_post_accumulate_grad_hook(post_grad_hook)
diff --git a/torchft/ddp_test.py b/torchft/ddp_test.py
@@ -32,14 +32,14 @@ def test_pure_ddp(self) -> None:
         for p in m.parameters():
             self.assertIsNotNone(p.grad)
 
-        self.assertEqual(manager.allreduce_grad.call_count, len(list(m.parameters())))
+        self.assertEqual(manager.allreduce.call_count, len(list(m.parameters())))
 
     def test_ddp(self) -> None:
         manager = create_autospec(Manager)
 
         call_count = 0
 
-        def allreduce_grad(tensor: torch.Tensor) -> Future[torch.Tensor]:
+        def allreduce(tensor: torch.Tensor) -> Future[torch.Tensor]:
             nonlocal call_count
 
             call_count += 1
@@ -48,7 +48,7 @@ def allreduce_grad(tensor: torch.Tensor) -> Future[torch.Tensor]:
             fut.set_result(tensor)
             return fut
 
-        manager.allreduce_grad = allreduce_grad
+        manager.allreduce = allreduce
 
         m = nn.Linear(3, 4)
         m = DistributedDataParallel(manager, m)
diff --git a/torchft/local_sgd.py b/torchft/local_sgd.py
@@ -0,0 +1,184 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+LocalSGD
+=========
+
+This module implements a fault tolerant version of LocalSGD and related methods.
+"""
+
+from typing import Any, Dict, List, Mapping, Optional
+
+import torch
+from torch import nn, optim
+
+from torchft.manager import Manager
+
+
+class LocalSGD(nn.Module):
+    """
+    LocalSGD is a model wrapper similar to DistributedDataParallel that
+    implements the algorithm described in https://arxiv.org/pdf/1805.09767
+
+    This will synchronize the model parameters periodically in a fault tolerant
+    way using a torchft Manager. The allreduce on the parameters will happen
+    every sync_every steps after the optimizer.step call.
+
+    To implement safe and fault tolerant, this requires a backup copy of the
+    weights. By default these are stored in CPU memory. If any error occurs
+    during the LocalSGD step, the step will be discarded and the model
+    parameters will reset back to the last time LocalSGD synchronized.
+
+    The backup weights could be eliminated by relaxing the guarantee of exactly
+    `sync_every` steps but that would diverge from the LocalSGD algorithm.
+    DiLoCo also needs this backup copy to compute the delta.
+
+    The torchft quorum is computed at the beginning of ``sync_every`` steps. If
+    any error occurs, or a worker fails between syncs, ``sync_every`` steps will be
+    discarded and a new quorum will be computed on the next step.
+
+    If running in async mode, on a joining worker the first ``sync_every`` steps
+    will discarded as the model will be recovering during that period. When
+    using sync mode, the checkpoint will be restored prior to the first step.
+
+    TODO: add a way via Manager to detect workers failing early for shrink only
+    TODO: add DiLoCo support
+    """
+
+    def __init__(
+        self,
+        manager: Manager,
+        model: nn.Module,
+        optimizer: optim.Optimizer,
+        sync_every: int,
+        backup_device: Optional[torch.device] = None,
+        pin_memory: bool = True,
+    ) -> None:
+        """
+        Args:
+            manager: The manager to use.
+            model: The model to wrap.
+            optimizer: The optimizer used by the model.
+            sync_every: How often to sync the model weights.
+            backup_device: The device to store the backup of the model parameters on. (default cpu)
+            pin_memory: Whether to pin the memory used for the backup of the model parameters.
+        """
+        super().__init__()
+
+        self._manager = manager
+        self._model = model
+        self._local_step = 0
+        self._started_step = False
+        self._sync_every = sync_every
+        assert sync_every >= 1, "sync_every must be greater than or equal to 1"
+
+        device = backup_device or torch.device("cpu")
+
+        self._backup_parameters: Dict[str, torch.Tensor] = {}
+
+        for name, p in self._model.named_parameters():
+            t = torch.empty(*tuple(p.shape), dtype=p.dtype, device=device)
+            if (
+                pin_memory
+                and t.device == torch.device("cpu")
+                and torch.cuda.is_available()
+            ):
+                t = t.pin_memory()
+            self._backup_parameters[name] = t
+
+        # Need to copy the parameters to the host to be safe if we are on the first step.
+        self._save_parameters()
+
+        optimizer.register_step_post_hook(self._step_post_hook)
+
+    def _save_parameters(self) -> None:
+        # TODO: consider running copy on a separate stream
+        for name, p in self._model.named_parameters():
+            self._backup_parameters[name].copy_(p.data, non_blocking=True)
+
+    def _restore_parameters(self) -> None:
+        # TODO: consider running copy on a separate stream
+        for name, p in self._model.named_parameters():
+            p.data.copy_(self._backup_parameters[name], non_blocking=True)
+
+    # pyre-fixme[14]: support state_dict args
+    def state_dict(self) -> Dict[str, object]:
+        """
+        state_dict returns the state_dict from the last time LocalSGD
+        synchronized and not the current weights.
+        """
+        state_dict = self._model.state_dict()
+        for name, p in self._backup_parameters.items():
+            assert name in state_dict
+            state_dict[name] = p
+        return state_dict
+
+    def load_state_dict(
+        self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False
+    ) -> None:
+        """
+        Loads the state dict to the model and the backup parameters.
+
+        This must be called while the model weights aren't being modified to
+        avoid corrupting the backup weights.
+        """
+        self._model.load_state_dict(state_dict, strict=strict, assign=assign)
+        self._save_parameters()
+
+    def forward(self, *args: object, **kwargs: object) -> object:
+        """
+        Run the model parameters.
+
+        This should be called before the optimizer step.
+
+        This will start the quorum and save the parameters if this is the first step.
+        """
+        if self._local_step == 0:
+            self._manager.start_quorum()
+
+        self._started_step = True
+
+        return self._model.forward(*args, **kwargs)
+
+    def _step_post_hook(
+        self, _optim: optim.Optimizer, _args: List[object], _kwargs: Dict[str, object]
+    ) -> None:
+        """
+        This hook is registered on the optimizer and is called after the optimizer step.
+
+        This will call the allreduce on the model weights every sync_every steps.
+        If any errors occur it will restore to the weights from the previous sync.
+
+        ``forward`` must be called before this function.
+        """
+        assert self._started_step, "forward must be called before step"
+        self._started_step = False
+
+        self._local_step += 1
+
+        if self._local_step >= self._sync_every:
+            self._local_step = 0
+            self._average()
+
+            if self._manager.should_commit():
+                # save the parameters so we can restore from them later if necessary.
+                self._save_parameters()
+            else:
+                # commit failed, restore from the backup parameters
+                self._restore_parameters()
+
+    def _average(self) -> None:
+        # TODO: do we need to broadcast buffers like DDP does?
+
+        works = []
+
+        for p in self._model.parameters():
+            # TODO: bucketize parameters
+            works.append(self._manager.allreduce(p))
+
+        for work in works:
+            work.wait()
diff --git a/torchft/local_sgd_test.py b/torchft/local_sgd_test.py
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+from unittest import TestCase
+from unittest.mock import create_autospec
+
+import torch
+from torch import nn, optim
+
+from torchft.local_sgd import LocalSGD
+from torchft.manager import Manager
+
+
+class SimpleModel(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+        self.model = nn.Sequential(
+            nn.Linear(3, 4),
+            nn.ReLU(),
+            nn.Linear(4, 5),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.model(x)
+
+
+def _params_dict(m: torch.nn.Module) -> Dict[str, torch.Tensor]:
+    return {name: p.data for name, p in m.named_parameters()}
+
+
+def _copy_state_dict(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    return {name: value.clone().detach() for name, value in state_dict.items()}
+
+
+class LocalSGDTest(TestCase):
+    def test_local_sgd_healthy(self) -> None:
+        base_m = SimpleModel()
+        optimizer = optim.SGD(base_m.parameters())
+        manager = create_autospec(Manager)
+
+        m = LocalSGD(manager, base_m, optimizer, sync_every=2)
+        self.assertEqual(m._local_step, 0)
+
+        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
+
+        inp = torch.rand(2, 3)
+
+        loss = m(inp).mean()
+        loss.backward()
+        optimizer.step()
+
+        self.assertEqual(m._local_step, 1)
+        self.assertEqual(manager.start_quorum.call_count, 1)
+
+        loss = m(inp).mean()
+        loss.backward()
+        optimizer.step()
+
+        manager.should_commit.return_value = True
+        self.assertEqual(m._local_step, 0)
+
+        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
+        self.assertEqual(manager.should_commit.call_count, 1)
+        self.assertEqual(manager.allreduce.call_count, 4)
+
+    def test_local_sgd_recovery(self) -> None:
+        base_m = SimpleModel()
+        optimizer = optim.SGD(base_m.parameters())
+        manager = create_autospec(Manager)
+
+        m = LocalSGD(manager, base_m, optimizer, sync_every=2)
+
+        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
+        og_state_dict = _copy_state_dict(base_m.state_dict())
+
+        inp = torch.rand(2, 3)
+
+        loss = m(inp).mean()
+        loss.backward()
+        optimizer.step()
+
+        self.assertEqual(m._local_step, 1)
+
+        state_dict = m.state_dict()
+        torch.testing.assert_close(state_dict, m._backup_parameters)
+        torch.testing.assert_close(state_dict, og_state_dict)
+
+        m.load_state_dict(state_dict)
+        torch.testing.assert_close(_params_dict(base_m), state_dict)
+        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
diff --git a/torchft/manager.py b/torchft/manager.py
diff --git a/torchft/manager_test.py b/torchft/manager_test.py