Skip to content

Commit 481d2ce

Browse files
committed
Update on "[WIP] Add DiLoCo"
Still WIP but open to feedback on the API ## API Usage ```python # LocalSGD example model = SimpleModel() optimizer = optim.SGD(model.parameters()) manager = create_autospec(Manager) with LocalSGD(manager, model, optimizer, sync_every=2): for inp, label in dataloader: loss = model(inp).mean() loss.backward() optimizer.step() # DiLoCo example model = SimpleModel() inner_optimizer = torch.optim.AdamW( m.parameters(), lr=4e-4, weight_decay=0.1, betas=(0.9, 0.95) ) outer_optimizer = torch.optim.SGD( m.parameters(), lr=0.7, momentum=0.9, nesterov=True ) manager = create_autospec(Manager) with DiLoCo(manager, m, inner_optimizer, outer_optimizer, sync_every=2): for inp, label in dataloader: loss = model(inp).mean() loss.backward() inner_optimizer.step() # outer_optimizer is used every 'sync_every' steps ``` ## Changes - Updated `LocalSGD` to be a context manager rather than a `nn.Module` wrapper. This required adding a pre_forward_hook to the model to start the quorum - Added DiLoCo. This is a subclass of LocalSGD since a lot of code is shared - TODO: should be working, but still validating some tests discussion doc: https://docs.google.com/document/d/11c5JwQpSzilrDvK-vNsgQhpXAihbMn-hTRC8y3LiGqY/edit?tab=t.0#heading=h.izo4yi6jz4mk [ghstack-poisoned]
2 parents cfbf7e2 + 68d4059 commit 481d2ce

25 files changed

+1713
-286
lines changed

.pyre_configuration

+4
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,9 @@
66
"import_root": ".",
77
"source": "torchft"
88
}
9+
],
10+
"search_path": [
11+
{"site-package": "torchx"},
12+
{"site-package": "parameterized"}
913
]
1014
}

.torchxconfig

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[cli:run]
2+
component=torchft/torchx.py:hsdp
3+
scheduler=local_cwd

CONTRIBUTING.md

+15
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,21 @@ make livehtml
9696
The docs will be built in the `docs/build/html` directory and served at http://localhost:8000.
9797
The page will be automatically re-built as long as the process is kept running.
9898

99+
### Running Multiple Replica Local Job
100+
101+
We use torchx to run multiple worker local test jobs. You need to run the
102+
lighthouse first and then you can use torchx to launch as many replica groups as
103+
you want. This uses the [train_ddp.py](./train_ddp.py) script.
104+
105+
```sh
106+
$ torchft_lighthouse --min_replicas 2 --join_timeout_ms 10000 &
107+
$ torchx run -- --replicas 10
108+
```
109+
110+
Once the Lighthouse has started you can view the status of all the workers at the Lighthouse dashboard.
111+
112+
Default address is: http://localhost:29510
113+
99114
## Contributor License Agreement ("CLA")
100115

101116
In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ log = "0.4.22"
1212
prost = "0.13.3"
1313
prost-types = "0.13.3"
1414
pyo3 = {version="0.22.3", features = ["extension-module"]}
15+
rand = "0.8.5"
1516
slog = "2.7.0"
1617
slog-stdlog = "4.1.1"
1718
stderrlog = "0.6.0"

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ greatly improve efficiency by avoiding stop the world training on errors.
5252

5353
Before proceeding, ensure you have the following installed:
5454

55-
- Rust (with necessaray dependencies)
55+
- Rust (with necessary dependencies)
5656
- `protobuf-compiler` and the corresponding development package for Protobuf.
5757

5858
Note that the Rust versions available in many conda environments may be outdated. To install the latest version of Rust, we recommend downloading it directly from the official website as shown in the below command:

output.txt

+616
Large diffs are not rendered by default.

proto/torchft.proto

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ message QuorumMember {
4141
string store_address = 3;
4242
int64 step = 4;
4343
uint64 world_size = 5;
44+
bool shrink_only = 6;
4445
}
4546

4647
message Quorum {
@@ -72,6 +73,7 @@ message ManagerQuorumRequest {
7273
int64 rank = 1;
7374
int64 step = 2;
7475
string checkpoint_server_addr = 3;
76+
bool shrink_only = 4;
7577
}
7678

7779
message ManagerQuorumResponse {

pyproject.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ dev = [
2727
"pyre-check",
2828
"parameterized",
2929
"expecttest",
30-
"numpy"
30+
"numpy",
31+
"torchx"
3132
]
3233

3334
[tool.maturin]

src/lib.rs

+39-27
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
pub mod lighthouse;
88
pub mod manager;
9+
mod net;
10+
mod retry;
11+
mod timeout;
912

1013
use core::time::Duration;
1114
use std::env;
@@ -46,6 +49,7 @@ impl Manager {
4649
store_addr: String,
4750
world_size: u64,
4851
heartbeat_interval: Duration,
52+
connect_timeout: Duration,
4953
) -> PyResult<Self> {
5054
py.allow_threads(move || {
5155
let runtime = Runtime::new()?;
@@ -58,6 +62,7 @@ impl Manager {
5862
store_addr,
5963
world_size,
6064
heartbeat_interval,
65+
connect_timeout,
6166
))
6267
.map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
6368
let handle = runtime.spawn(manager.clone().run());
@@ -84,47 +89,47 @@ impl Manager {
8489
struct ManagerClient {
8590
runtime: Runtime,
8691
client: ManagerServiceClient<Channel>,
87-
timeout: Duration,
8892
}
8993

9094
#[pymethods]
9195
impl ManagerClient {
9296
#[new]
93-
fn new(py: Python<'_>, addr: String, timeout: Duration) -> PyResult<Self> {
97+
fn new(py: Python<'_>, addr: String, connect_timeout: Duration) -> PyResult<Self> {
9498
py.allow_threads(move || {
9599
let runtime = Runtime::new()?;
96100
let client = runtime
97-
.block_on(manager::manager_client_new(addr, timeout))
101+
.block_on(manager::manager_client_new(addr, connect_timeout))
98102
.map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
99103

100104
Ok(Self {
101105
runtime: runtime,
102106
client: client,
103-
timeout: timeout,
104107
})
105108
})
106109
}
107110

108-
#[pyo3(signature = (rank, step, checkpoint_server_addr, timeout=None))]
109111
fn quorum(
110-
&mut self,
112+
&self,
111113
py: Python<'_>,
112114
rank: i64,
113115
step: i64,
114116
checkpoint_server_addr: String,
115-
timeout: Option<Duration>,
117+
shrink_only: bool,
118+
timeout: Duration,
116119
) -> Result<(i64, i64, i64, String, String, i64, Option<i64>, i64, bool), StatusError> {
117120
py.allow_threads(move || {
118121
let mut request = tonic::Request::new(ManagerQuorumRequest {
119122
rank: rank,
120123
step: step,
121124
checkpoint_server_addr: checkpoint_server_addr,
125+
shrink_only: shrink_only,
122126
});
123-
// This notifies the server about the timeout but doesn't affect the
124-
// endpoint timeout which we set on client creation.
125-
request.set_timeout(timeout.unwrap_or(self.timeout));
126127

127-
let response = self.runtime.block_on(self.client.quorum(request))?;
128+
// This timeout is processed on the server side so we also enable
129+
// keep alives to detect server health.
130+
request.set_timeout(timeout);
131+
132+
let response = self.runtime.block_on(self.client.clone().quorum(request))?;
128133
let resp = response.into_inner();
129134
Ok((
130135
resp.quorum_id,
@@ -140,47 +145,49 @@ impl ManagerClient {
140145
})
141146
}
142147

143-
#[pyo3(signature = (rank, timeout=None))]
144148
fn checkpoint_address(
145-
&mut self,
149+
&self,
146150
py: Python<'_>,
147151
rank: i64,
148-
timeout: Option<Duration>,
152+
timeout: Duration,
149153
) -> Result<String, StatusError> {
150154
py.allow_threads(move || {
151155
let mut request = tonic::Request::new(CheckpointAddressRequest { rank: rank });
152-
// This notifies the server about the timeout but doesn't affect the
153-
// endpoint timeout which we set on client creation.
154-
request.set_timeout(timeout.unwrap_or(self.timeout));
156+
157+
// This timeout is processed on the server side so we also enable
158+
// keep alives to detect server health.
159+
request.set_timeout(timeout);
155160

156161
let response = self
157162
.runtime
158-
.block_on(self.client.checkpoint_address(request))?;
163+
.block_on(self.client.clone().checkpoint_address(request))?;
159164
let resp = response.into_inner();
160165
Ok(resp.checkpoint_server_address)
161166
})
162167
}
163168

164-
#[pyo3(signature = (rank, step, should_commit, timeout=None))]
165169
fn should_commit(
166-
&mut self,
170+
&self,
167171
py: Python<'_>,
168172
rank: i64,
169173
step: i64,
170174
should_commit: bool,
171-
timeout: Option<Duration>,
175+
timeout: Duration,
172176
) -> Result<bool, StatusError> {
173177
py.allow_threads(move || {
174178
let mut request = tonic::Request::new(ShouldCommitRequest {
175179
rank: rank,
176180
step: step,
177181
should_commit: should_commit,
178182
});
183+
179184
// This notifies the server about the timeout but doesn't affect the
180185
// endpoint timeout which we set on client creation.
181-
request.set_timeout(timeout.unwrap_or(self.timeout));
186+
request.set_timeout(timeout);
182187

183-
let response = self.runtime.block_on(self.client.should_commit(request))?;
188+
let response = self
189+
.runtime
190+
.block_on(self.client.clone().should_commit(request))?;
184191
let resp = response.into_inner();
185192
Ok(resp.should_commit)
186193
})
@@ -297,11 +304,16 @@ impl From<Status> for StatusError {
297304
#[pymodule]
298305
fn torchft(m: &Bound<'_, PyModule>) -> PyResult<()> {
299306
// setup logging on import
300-
stderrlog::new()
301-
.verbosity(2)
307+
let mut log = stderrlog::new();
308+
log.verbosity(2)
302309
.show_module_names(true)
303-
.timestamp(stderrlog::Timestamp::Millisecond)
304-
.init()
310+
.timestamp(stderrlog::Timestamp::Millisecond);
311+
312+
if env::var("CLICOLOR_FORCE").is_ok() {
313+
log.color(stderrlog::ColorChoice::AlwaysAnsi);
314+
}
315+
316+
log.init()
305317
.map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
306318

307319
m.add_class::<Manager>()?;

0 commit comments

Comments
 (0)