
Commit 2dcd290

manager: add per request timeouts
1 parent a32f807 commit 2dcd290

File tree

4 files changed: +76 -23 lines changed

    src/lib.rs           +14 -4
    src/manager.rs       +19 -8
    torchft/manager.py   +27 -8
    torchft/torchft.pyi  +16 -3

src/lib.rs (+14 -4)

@@ -102,13 +102,15 @@ impl ManagerClient {
         })
     }
 
+    #[pyo3(signature = (room_id, rank, step, checkpoint_server_addr, timeout=None))]
     fn quorum(
         &mut self,
         py: Python<'_>,
         room_id: String,
         rank: i64,
         step: i64,
         checkpoint_server_addr: String,
+        timeout: Option<Duration>,
     ) -> PyResult<(i64, i64, i64, String, String, i64, Option<i64>, i64, bool)> {
         py.allow_threads(move || {
             let mut request = tonic::Request::new(ManagerQuorumRequest {
@@ -119,7 +121,7 @@ impl ManagerClient {
             });
             // This notifies the server about the timeout but doesn't affect the
             // endpoint timeout which we set on client creation.
-            request.set_timeout(self.timeout);
+            request.set_timeout(timeout.unwrap_or(self.timeout));
 
             let response = self
                 .runtime
@@ -140,12 +142,18 @@ impl ManagerClient {
         })
     }
 
-    fn checkpoint_address(&mut self, py: Python<'_>, rank: i64) -> PyResult<String> {
+    #[pyo3(signature = (rank, timeout=None))]
+    fn checkpoint_address(
+        &mut self,
+        py: Python<'_>,
+        rank: i64,
+        timeout: Option<Duration>,
+    ) -> PyResult<String> {
         py.allow_threads(move || {
             let mut request = tonic::Request::new(CheckpointAddressRequest { rank: rank });
             // This notifies the server about the timeout but doesn't affect the
             // endpoint timeout which we set on client creation.
-            request.set_timeout(self.timeout);
+            request.set_timeout(timeout.unwrap_or(self.timeout));
 
             let response = self
                 .runtime
@@ -156,12 +164,14 @@ impl ManagerClient {
         })
     }
 
+    #[pyo3(signature = (rank, step, should_commit, timeout=None))]
     fn should_commit(
         &mut self,
         py: Python<'_>,
         rank: i64,
         step: i64,
         should_commit: bool,
+        timeout: Option<Duration>,
     ) -> PyResult<bool> {
         py.allow_threads(move || {
             let mut request = tonic::Request::new(ShouldCommitRequest {
@@ -171,7 +181,7 @@ impl ManagerClient {
             });
             // This notifies the server about the timeout but doesn't affect the
             // endpoint timeout which we set on client creation.
-            request.set_timeout(self.timeout);
+            request.set_timeout(timeout.unwrap_or(self.timeout));
 
             let response = self
                 .runtime

src/manager.rs (+19 -8)

@@ -180,9 +180,9 @@ impl ManagerService for Arc<Manager> {
         &self,
         request: Request<ManagerQuorumRequest>,
     ) -> Result<Response<ManagerQuorumResponse>, Status> {
-        let req = request.into_inner();
+        let req = request.get_ref();
         let rank = req.rank;
-        let room_id = req.room_id;
+        let room_id = &req.room_id;
 
         info!("{}: got quorum request for rank {}", room_id, rank);
 
@@ -195,7 +195,7 @@ impl ManagerService for Arc<Manager> {
             .checkpoint_servers
             .insert(req.rank, req.checkpoint_server_addr.clone());
 
-        if !state.rooms.contains_key(&room_id) {
+        if !state.rooms.contains_key(room_id) {
             let (tx, _) = broadcast::channel(16);
 
             state.rooms.insert(
@@ -207,7 +207,7 @@ impl ManagerService for Arc<Manager> {
             );
         }
 
-        let room = state.rooms.get_mut(&room_id).unwrap();
+        let room = state.rooms.get_mut(room_id).unwrap();
 
         // TODO check step
         room.participants.insert(rank);
@@ -224,7 +224,7 @@ impl ManagerService for Arc<Manager> {
             .await
             .map_err(|e| Status::from_error(e.into()))?;
 
-        let request = tonic::Request::new(LighthouseQuorumRequest {
+        let mut lighthouse_request = tonic::Request::new(LighthouseQuorumRequest {
             room_id: room_id.clone(),
             requester: Some(QuorumMember {
                 replica_id: self.replica_id.clone(),
@@ -235,7 +235,16 @@ impl ManagerService for Arc<Manager> {
             }),
         });
 
-        let response = client.quorum(request).await.unwrap();
+        // propagate timeout from request to lighthouse
+        let timeout = request
+            .metadata()
+            .get("grpc-timeout")
+            .ok_or_else(|| Status::internal("grpc-timeout not set"))?;
+        lighthouse_request
+            .metadata_mut()
+            .insert("grpc-timeout", timeout.clone());
+
+        let response = client.quorum(lighthouse_request).await.unwrap();
         let resp = response.into_inner();
 
         info!("{}: got lighthouse quorum {:?}", room_id, resp);
@@ -471,12 +480,13 @@ mod tests {
 
         let mut client = manager_client_new(manager.address(), Duration::from_secs(10)).await?;
 
-        let request = tonic::Request::new(ManagerQuorumRequest {
+        let mut request = tonic::Request::new(ManagerQuorumRequest {
             room_id: "room".to_string(),
             rank: 0,
             step: 123,
             checkpoint_server_addr: "addr".to_string(),
         });
+        request.set_timeout(Duration::from_secs(10));
         let resp = client.quorum(request).await?.into_inner();
 
         manager_fut.abort();
@@ -526,12 +536,13 @@ mod tests {
         let mut client =
            manager_client_new(manager.address(), Duration::from_secs(10)).await?;
 
-        let request = tonic::Request::new(ManagerQuorumRequest {
+        let mut request = tonic::Request::new(ManagerQuorumRequest {
             room_id: "room".to_string(),
             rank: 0,
             step: 0,
             checkpoint_server_addr: "addr".to_string(),
         });
+        request.set_timeout(Duration::from_secs(10));
 
         let result = client.quorum(request).await?.into_inner();
 
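Taken together, the Rust changes make per-request timeouts visible from Python: each ManagerClient method now accepts an optional timeout that overrides the client-level default (timeout.unwrap_or(self.timeout)), and the manager copies the resulting grpc-timeout metadata onto its lighthouse quorum request, returning an internal error if no timeout is attached. A rough usage sketch from Python, assuming ManagerClient is importable from torchft.torchft as the stub file later in this diff suggests; the address, rank, and step values are placeholders:

from datetime import timedelta

from torchft.torchft import ManagerClient  # assumed import path per torchft/torchft.pyi

# Client-level timeout: used whenever a call doesn't pass its own timeout.
client = ManagerClient("localhost:29500", timeout=timedelta(seconds=10))

# Per-request timeout: overrides the 10s default for this call only and is
# sent as grpc-timeout metadata, which the manager forwards to the lighthouse.
quorum = client.quorum(
    room_id="default",
    rank=0,
    step=123,
    checkpoint_server_addr="localhost:12345",
    timeout=timedelta(seconds=30),
)

# Omitting timeout falls back to the client-level 10s timeout.
addr = client.checkpoint_address(rank=0)
ok = client.should_commit(rank=0, step=123, should_commit=True)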

torchft/manager.py (+27 -8)

@@ -102,7 +102,10 @@ def __init__(
             min_replica_size: minimum number of replicas on each step
             port: if rank==0, the port to run the manager server on
             use_async_quorum: whether to run the quorum asynchronously during the forward pass
-            timeout: timeout for all operations
+            timeout:
+                the default timeout for all operations; if you're using per
+                request timeouts this should be longer than the longest request
+                timeout.
             rank: the replica group local rank
             world_size: the replica group local world size
             store_addr: TCPStore address for this replica group
@@ -279,7 +282,10 @@ def errored(self) -> Optional[Exception]:
         return self._errored
 
     def wrap_future(
-        self, fut: torch.futures.Future[T], default: T
+        self,
+        fut: torch.futures.Future[T],
+        default: T,
+        timeout: Optional[timedelta] = None,
     ) -> torch.futures.Future[T]:
         """
         Wrap a Future and swallow any errors that occur and report them to the manager.
@@ -289,10 +295,11 @@ def wrap_future(
         Args:
             fut: the Future to wrap
             default: the default value to complete the Future with if an error occurs
+            timeout: the timeout for the Future; if None, the manager's timeout will be used
         """
 
         # add a timeout to the future
-        fut = future_timeout(fut, self._timeout)
+        fut = future_timeout(fut, timeout or self._timeout)
 
         # schedule error handling as a continuation on the Future
         def callback(
@@ -313,7 +320,12 @@ def callback(
         self._pending_work.append(cast(torch.futures.Future[object], fut))
         return fut
 
-    def start_quorum(self, room_id: str = "default", allow_heal: bool = True) -> None:
+    def start_quorum(
+        self,
+        room_id: str = "default",
+        allow_heal: bool = True,
+        timeout: Optional[timedelta] = None,
+    ) -> None:
         """
         .. note::
             We recommend using the :py:class:`torchft.optim.OptimizerWrapper` instead of calling this directly.
@@ -331,6 +343,7 @@ def start_quorum(self, room_id: str = "default", allow_heal: bool = True) -> None:
                 calls. All replicas must pass the same value to allow_heal.
             room_id: (experimental) the room id to use for quorum, this allows
                 for multiple quorums to be used within the same job.
+            timeout: the timeout for quorum and recovery operations; if None, the manager's timeout will be used
         """
 
         # wait for previous quorum to complete
@@ -345,7 +358,10 @@ def start_quorum(self, room_id: str = "default", allow_heal: bool = True) -> None:
         # block to allow gracefully recovering from issues in PG setup and quorum.
 
         self._quorum_future = self._executor.submit(
-            self._async_quorum, room_id=room_id, allow_heal=allow_heal
+            self._async_quorum,
+            room_id=room_id,
+            allow_heal=allow_heal,
+            timeout=timeout or self._timeout,
         )
         if not self._use_async_quorum:
             self.wait_quorum()
@@ -369,7 +385,7 @@ def wait_quorum(self) -> None:
         ), "must call start_quorum before wait_quorum"
         self._quorum_future.result()
 
-    def _async_quorum(self, room_id: str, allow_heal: bool) -> None:
+    def _async_quorum(self, room_id: str, allow_heal: bool, timeout: timedelta) -> None:
         (
             quorum_id,
             replica_rank,
@@ -385,6 +401,7 @@ def _async_quorum(self, room_id: str, allow_heal: bool) -> None:
             rank=self._rank,
             step=self._step,
             checkpoint_server_addr=self._ckpt_server.address(),
+            timeout=timeout,
         )
 
         # When using async quorum we need to take the recovered workers.
@@ -422,8 +439,10 @@ def _async_quorum(self, room_id: str, allow_heal: bool) -> None:
         self._logger.info(
             f"healing required, fetching checkpoint server address from {address=} {max_step=}"
         )
-        primary_client = ManagerClient(address, timeout=self._timeout)
-        checkpoint_server_address = primary_client.checkpoint_address(self._rank)
+        primary_client = ManagerClient(address, timeout=timeout)
+        checkpoint_server_address = primary_client.checkpoint_address(
+            self._rank, timeout=timeout
+        )
 
         self._logger.info(f"fetching checkpoint from {checkpoint_server_address=}")
         self._pending_state_dict = CheckpointServer.load_from_address(
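On the Python side, the same override flows through the public Manager API: start_quorum passes the per-call timeout down to _async_quorum (and to the checkpoint address lookup during healing), and wrap_future applies it to the wrapped Future. A minimal sketch of the intended usage, assuming a Manager instance named manager was constructed elsewhere (its constructor arguments are out of scope here) with a 60-second default timeout; the 30-second and 5-second values are placeholders:

from datetime import timedelta

import torch

# manager: a torchft.manager.Manager built elsewhere with
# timeout=timedelta(seconds=60) as the default for all operations.

# This quorum (and any recovery it triggers) uses a 30s timeout instead of
# the manager-wide 60s default.
manager.start_quorum(room_id="default", timeout=timedelta(seconds=30))

# Per-future override: the wrapped future is timed out after 5s instead of
# 60s, completing with the provided default value if an error occurs.
fut: torch.futures.Future[torch.Tensor] = torch.futures.Future()
fut = manager.wrap_future(fut, default=torch.zeros(1), timeout=timedelta(seconds=5))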

torchft/torchft.pyi (+16 -3)

@@ -4,10 +4,23 @@ from typing import Optional, Tuple
 class ManagerClient:
     def __init__(self, addr: str, timeout: timedelta) -> None: ...
     def quorum(
-        self, room_id: str, rank: int, step: int, checkpoint_server_addr: str
+        self,
+        room_id: str,
+        rank: int,
+        step: int,
+        checkpoint_server_addr: str,
+        timeout: Optional[timedelta] = None,
     ) -> Tuple[int, int, int, str, str, int, Optional[int], int, bool]: ...
-    def checkpoint_address(self, rank: int) -> str: ...
-    def should_commit(self, rank: int, step: int, should_commit: bool) -> bool: ...
+    def checkpoint_address(
+        self, rank: int, timeout: Optional[timedelta] = None
+    ) -> str: ...
+    def should_commit(
+        self,
+        rank: int,
+        step: int,
+        should_commit: bool,
+        timeout: Optional[timedelta] = None,
+    ) -> bool: ...
 
 class Manager:
     def __init__(
