Skip to content

Commit 9c13c5f

Browse files
committed
manager: error reporting APIs and numerics test
1 parent bbaf95e commit 9c13c5f

File tree

2 files changed

+94
-14
lines changed

2 files changed

+94
-14
lines changed

torchft/manager.py

+56-13
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def allreduce_grad(self, grad: torch.Tensor) -> torch.futures.Future[torch.Tenso
178178
Returns:
179179
a Future that will be completed with the allreduced gradient
180180
"""
181-
if self._errored:
181+
if self.errored():
182182
fut = torch.futures.Future()
183183
fut.set_result(grad)
184184
return fut
@@ -195,38 +195,81 @@ def allreduce_grad(self, grad: torch.Tensor) -> torch.futures.Future[torch.Tenso
195195
work = self._pg.allreduce([grad], ReduceOp.SUM)
196196
fut = work.get_future()
197197

198-
# schedule error handling and grad normalization as a continuation
198+
# schedule grad normalization as a continuation
199199
# on the Future
200200
def callback(
201201
fut: torch.futures.Future[List[torch.Tensor]],
202202
) -> torch.futures.Future[torch.Tensor]:
203203
nonlocal grad
204204

205-
try:
206-
val = fut.value()
207-
except Exception:
208-
logger.exception(
209-
"got exception in all reduce future -- skipping remaining"
210-
)
211-
self._errored = True
212-
return grad
205+
fut.value()
213206

214207
grad /= self.num_participants()
215208

216209
return grad
217210

218211
fut = fut.then(callback)
219-
self._pending_work.append(fut)
212+
fut = self.wrap_future(fut, grad)
220213
return fut
221214

222215
except Exception as e:
223-
logger.exception("got exception in all reduce -- skipping remaining")
224-
self._errored = True
216+
logger.exception(f"got exception in all reduce -- skipping remaining: {e}")
217+
self.report_error()
225218

226219
fut = torch.futures.Future()
227220
fut.set_result(grad)
228221
return fut
229222

223+
def report_error(self) -> None:
224+
"""
225+
Report an error to the manager.
226+
227+
This will cause the manager to skip the current step and will be
228+
reconfigured on the next step.
229+
230+
This should be called when an error occurs that leads to a corrupted
231+
gradient that needs to be discarded.
232+
"""
233+
self._errored = True
234+
235+
def errored(self) -> bool:
236+
"""
237+
Get whether an error has occurred.
238+
239+
Returns:
240+
whether an error has occurred
241+
"""
242+
return self._errored
243+
244+
def wrap_future(self, fut: torch.futures.Future[object], default: object) -> None:
245+
"""
246+
Wrap a Future and swallow any errors that occur and report them to the manager.
247+
248+
If an error occurs, the Future will be completed with the default value.
249+
250+
Args:
251+
fut: the Future to wrap
252+
default: the default value to complete the Future with if an error occurs
253+
"""
254+
255+
# schedule error handling and grad normalization as a continuation
256+
# on the Future
257+
def callback(
258+
fut: torch.futures.Future[List[torch.Tensor]],
259+
) -> torch.futures.Future[torch.Tensor]:
260+
nonlocal default
261+
262+
try:
263+
return fut.value()
264+
except Exception as e:
265+
logger.exception(f"got exception in future -- skipping remaining: {e}")
266+
self.report_error()
267+
return default
268+
269+
fut = fut.then(callback)
270+
self._pending_work.append(fut)
271+
return fut
272+
230273
def step(self) -> None:
231274
"""
232275
.. note::

torchft/manager_test.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import torch
1111
from torch.distributed import TCPStore
1212
from torchft.manager import Manager, MANAGER_ADDR_KEY
13-
from torchft.process_group import ProcessGroup
13+
from torchft.process_group import _DummyWork, ProcessGroup
1414

1515
from torchft.torchft import ManagerClient
1616

@@ -311,3 +311,40 @@ def test_allreduce_error(self, client_mock) -> None:
311311
manager.step()
312312
manager.allreduce_grad(torch.tensor([1.0])).wait()
313313
self.assertTrue(manager.should_commit())
314+
315+
@patch("torchft.manager.ManagerClient", autospec=True)
316+
def test_manager_report_error(self, client_mock) -> None:
317+
manager = self._create_manager()
318+
319+
self.assertFalse(manager.errored())
320+
manager.report_error()
321+
self.assertTrue(manager.errored())
322+
323+
@patch("torchft.manager.ManagerClient", autospec=True)
324+
def test_manager_wrap_future(self, client_mock) -> None:
325+
manager = self._create_manager()
326+
327+
self.assertFalse(manager.errored())
328+
329+
fut = torch.futures.Future()
330+
wrapped_fut = manager.wrap_future(fut, 2)
331+
332+
fut.set_exception(RuntimeError("injected failure"))
333+
334+
self.assertEqual(wrapped_fut.value(), 2)
335+
self.assertTrue(manager.errored())
336+
self.assertEqual(manager._pending_work, [wrapped_fut])
337+
338+
@patch("torchft.manager.ManagerClient", autospec=True)
339+
def test_manager_numerics(self, client_mock) -> None:
340+
manager = self._create_manager()
341+
342+
manager._quorum_future = MagicMock()
343+
manager._participating_replicas = 5
344+
self.assertEqual(manager.num_participants(), 5)
345+
manager._pg.allreduce.return_value = _DummyWork(None)
346+
347+
fut = torch.futures.Future()
348+
fut = manager.allreduce_grad(torch.tensor([1.0]))
349+
result = fut.value()
350+
torch.testing.assert_close(result, torch.tensor([1.0 / 5]))

0 commit comments

Comments (0)