
Commit 5bc2a69

process_group: make ManagedProcessGroup use wrap_future
1 parent f82a1a2 commit 5bc2a69

File tree

torchft/process_group.py
torchft/process_group_test.py

2 files changed: +55 -15 lines changed

torchft/process_group.py

+44-11
@@ -436,29 +436,62 @@ def allreduce(self, tensors: List[torch.Tensor], opts: object) -> Work:
         return _DummyWork(tensors)
 
 
-class ManagedProcessGroup(ErrorSwallowingProcessGroupWrapper):
+class _ManagedWork(Work):
+    def __init__(self, manager: "Manager", work: Work, default_result: object) -> None:
+        super().__init__()
+
+        self._manager = manager
+        self._work = work
+        self._default_result = default_result
+
+    def wait(self, timeout: Optional[timedelta] = None) -> bool:
+        try:
+            if timeout is not None:
+                self._work.wait(timeout)
+            else:
+                self._work.wait()
+        except Exception as e:
+            self._manager.report_error(e)
+
+        return True
+
+    def get_future(self) -> Future[object]:
+        return self._manager.wrap_future(self._work.get_future(), self._default_result)
+
+
+class ManagedProcessGroup(ProcessGroupWrapper):
     """
     This is a wrapper around any ProcessGroup that is managed by a torchft
     Manager.
+
+    This uses the ProcessGroup that is configured in the Manager. The world size
+    is dynamic and will report the number of active participants in the quorum to
+    the model.
+
+    Any errors will be asynchronously reported to the manager and only successes
+    will be returned to the caller.
     """
 
     def __init__(self, manager: "Manager") -> None:
         super().__init__(manager._pg)
 
         self._manager = manager
 
-    def report_error(self, e: Exception) -> None:
-        """
-        Report an error to this process group. This will cause all future
-        operations to be skipped until the process group is reconfigured via
-        ``configure``.
+    def allreduce(self, tensors: List[torch.Tensor], opts: object) -> Work:
+        if self._manager.errored() is not None:
+            return _DummyWork(tensors)
 
-        Args:
-            e: exception to report
-        """
-        super().report_error(e)
+        try:
+            work = super().allreduce(tensors, opts)
+        except Exception as e:
+            self._manager.report_error(e)
+            return _DummyWork(tensors)
 
-        self._manager.report_error(e)
+        return _ManagedWork(
+            self._manager,
+            work,
+            tensors,
+        )
 
     def size(self) -> int:
         return self._manager.num_participants()
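
The diff above routes the collective's Future through Manager.wrap_future, whose exact behavior lives in the Manager and is not shown in this commit. As a rough standalone sketch (not torchft code), one plausible reading is that the wrapped future reports any error to the manager and resolves to the default result instead of raising to the caller:

# Standalone sketch (not torchft code): assumes Manager.wrap_future behaves
# roughly like this -- report the error and fall back to a default result
# instead of raising it to the caller.
from typing import TypeVar

import torch
from torch.futures import Future

T = TypeVar("T")


def wrap_future_sketch(fut: Future, default_result: T) -> Future:
    def handler(completed: Future) -> T:
        try:
            # Re-raises if the underlying collective failed.
            return completed.wait()
        except Exception as e:
            # Stand-in for manager.report_error(e); real handling is async.
            print(f"reported to manager: {e}")
            return default_result

    # then() returns a new Future holding the callback's return value.
    return fut.then(handler)


if __name__ == "__main__":
    failed = Future()
    failed.set_exception(RuntimeError("allreduce failed"))
    wrapped = wrap_future_sketch(failed, torch.zeros(3))
    print(wrapped.wait())  # tensor([0., 0., 0.]) instead of an exception

Here wrap_future_sketch, handler, and the printed "reported to manager" line are purely illustrative stand-ins for the Manager's real error-reporting path.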

torchft/process_group_test.py

+11-4
@@ -29,6 +29,7 @@
     ProcessGroupWrapper,
     _DummyWork,
     _ErrorSwallowingWork,
+    _ManagedWork,
     extend_device_mesh,
 )
 
@@ -238,13 +239,19 @@ def test_error_swallowing_process_group_wrapper(self) -> None:
 
     def test_managed_process_group(self) -> None:
         manager = Mock(spec=Manager)
+        manager.errored.return_value = None
         manager._pg = ProcessGroupDummy(0, 1)
         pg = ManagedProcessGroup(manager)
         manager.num_participants.return_value = 123
 
         self.assertEqual(pg.size(), 123)
 
-        err = RuntimeError("test")
-        pg.report_error(err)
-        self.assertEqual(pg.error(), err)
-        self.assertEqual(manager.report_error.call_count, 1)
+        t = torch.zeros(10)
+        work = pg.allreduce([t], ReduceOp.SUM)
+        self.assertIsInstance(work, _ManagedWork)
+        work.wait()
+        fut = work.get_future()
+        fut.wait()
+
+        self.assertEqual(manager.report_error.call_count, 0)
+        self.assertEqual(manager.wrap_future.call_count, 1)
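
One detail worth noting about the updated test: because manager is Mock(spec=Manager), wrap_future returns another Mock, so fut.wait() records a call rather than blocking on a real Future; the call-count assertions are what actually verify the new wiring. A minimal illustration of that mock behavior, using a hypothetical FakeManager stand-in rather than the real torchft Manager:

# Minimal illustration (hypothetical FakeManager, not torchft's Manager):
# a spec'd Mock returns child Mocks from its methods, so chained calls like
# wrap_future(...).wait() are simply recorded instead of doing real work.
from unittest.mock import Mock


class FakeManager:
    def wrap_future(self, fut: object, default: object) -> object: ...
    def report_error(self, e: Exception) -> None: ...


manager = Mock(spec=FakeManager)
fut = manager.wrap_future("future", "default")  # returns a child Mock
fut.wait()                                      # recorded, nothing awaited
print(manager.wrap_future.call_count)           # 1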
