manager: rename step to start_step + small shutdown fix (#44)

d4l3k · web-flow · commit 78c5721f51cd · 2024-12-17T16:10:39.000-08:00
diff --git a/torchft/manager.py b/torchft/manager.py
@@ -193,6 +193,7 @@ def shutdown(self) -> None:
         self._ckpt_server.shutdown()
         if self._manager is not None:
             self._manager.shutdown()
+        self._executor.shutdown()
 
     def allreduce_grad(self, grad: torch.Tensor) -> torch.futures.Future[torch.Tensor]:
         """
@@ -314,15 +315,16 @@ def callback(
         self._pending_work.append(cast(torch.futures.Future[object], fut))
         return fut
 
-    def step(self) -> None:
+    def start_step(self) -> None:
         """
         .. note::
             We recommend using the :py:class:`torchft.optim.OptimizerWrapper` instead of calling this directly.
 
-        Must be called before the forwards pass of each step.
-
         Computes a new quorum (potentially asynchronously) and readies the
         manager for a new step.
+
+        Must be called before the forward pass of each step for best
+        performance, as computing the quorum may take some time.
         """
 
         if self._should_step:
diff --git a/torchft/manager_test.py b/torchft/manager_test.py
@@ -102,7 +102,7 @@ def test_quorum_happy(self, client_mock: MagicMock) -> None:
         self.assertEqual(manager._step, 0)
         self.assertEqual(manager.batches_committed(), 0)
 
-        manager.step()
+        manager.start_step()
         manager.allreduce_grad(torch.tensor([1.0])).wait()
         self.assertEqual(len(manager._pending_work), 1)
         self.assertTrue(manager.should_commit())
@@ -113,7 +113,7 @@ def test_quorum_happy(self, client_mock: MagicMock) -> None:
         # pyre-ignore[16]: _pg is mocked
         self.assertEqual(manager._pg.allreduce.call_count, 1)
 
-        manager.step()
+        manager.start_step()
         self.assertEqual(manager.batches_committed(), 2)
 
     @patch("torchft.manager.ManagerClient", autospec=True)
@@ -140,7 +140,7 @@ def test_quorum_heal_sync(self, client_mock: MagicMock) -> None:
         self.assertEqual(manager._quorum_id, -1)
         self.assertEqual(manager._step, 0)
 
-        manager.step()
+        manager.start_step()
         manager.allreduce_grad(torch.tensor([1.0])).wait()
         self.assertFalse(manager._healing)
         self.assertTrue(manager.is_participating())
@@ -182,7 +182,7 @@ def test_quorum_heal_async_not_enough_participants(
         self.assertEqual(manager._quorum_id, -1)
         self.assertEqual(manager._step, 0)
 
-        manager.step()
+        manager.start_step()
         assert manager._quorum_future is not None
         manager._quorum_future.result()
         self.assertTrue(manager._healing)
@@ -206,7 +206,7 @@ def test_quorum_heal_async_not_enough_participants(
         self.assertEqual(self.load_state_dict.call_count, 1)
 
         # failed to commit so no step
-        manager.step()
+        manager.start_step()
         self.assertEqual(manager._step, 20)
         self.assertEqual(manager.batches_committed(), 0)
 
@@ -234,7 +234,7 @@ def test_quorum_heal_async_zero_grad(self, client_mock: MagicMock) -> None:
         self.assertEqual(manager._quorum_id, -1)
         self.assertEqual(manager._step, 0)
 
-        manager.step()
+        manager.start_step()
         assert manager._quorum_future is not None
         manager._quorum_future.result()
         self.assertTrue(manager._healing)
@@ -256,7 +256,7 @@ def test_quorum_heal_async_zero_grad(self, client_mock: MagicMock) -> None:
 
         self.assertEqual(self.load_state_dict.call_count, 1)
 
-        manager.step()
+        manager.start_step()
         self.assertEqual(manager._step, 21)
         self.assertEqual(manager.batches_committed(), 1)
 
@@ -280,7 +280,7 @@ def test_allreduce_error(self, client_mock: MagicMock) -> None:
         self.assertEqual(manager._quorum_id, -1)
         self.assertEqual(manager._step, 0)
 
-        manager.step()
+        manager.start_step()
         manager.allreduce_grad(torch.tensor([1.0])).wait()
         # pyre-ignore[16]: _pg is mocked
         self.assertEqual(manager._pg.allreduce.call_count, 1)
@@ -314,7 +314,7 @@ def test_allreduce_error(self, client_mock: MagicMock) -> None:
             2,  # max_world_size
             False,  # heal
         )
-        manager.step()
+        manager.start_step()
 
         self.assertFalse(manager._errored)
 
@@ -343,7 +343,7 @@ def test_allreduce_error(self, client_mock: MagicMock) -> None:
             False,  # heal
         )
 
-        manager.step()
+        manager.start_step()
         manager.allreduce_grad(torch.tensor([1.0])).wait()
         self.assertTrue(manager.should_commit())
 
@@ -375,13 +375,13 @@ def test_quorum_fixed_world_size(self, client_mock: MagicMock) -> None:
             self.assertEqual(manager._step, 0)
             self.assertEqual(manager.batches_committed(), 0)
 
-            manager.step()
+            manager.start_step()
             manager.allreduce_grad(torch.tensor([1.0])).wait()
 
             self.assertEqual(manager.is_participating(), rank != 2)
             self.assertEqual(manager.num_participants(), 2)
 
-            manager.step()
+            manager.start_step()
             self.assertEqual(manager.batches_committed(), 2)
 
     @patch("torchft.manager.ManagerClient", autospec=True)
diff --git a/torchft/optim.py b/torchft/optim.py
@@ -45,7 +45,7 @@ def state_dict(self) -> object:
         return self.optim.state_dict()
 
     def zero_grad(self, set_to_none: bool = True) -> None:
-        self.manager.step()
+        self.manager.start_step()
         self.optim.zero_grad(set_to_none)
 
     def step(self, closure: Optional[object] = None) -> None:
diff --git a/torchft/optim_test.py b/torchft/optim_test.py
@@ -32,7 +32,7 @@ def test_optimizer_wrapper(self) -> None:
         optim.load_state_dict(optim.state_dict())
 
         optim.zero_grad()
-        self.assertEqual(manager.step.call_count, 1)
+        self.assertEqual(manager.start_step.call_count, 1)
 
         manager.should_commit.return_value = True
         optim.step()