use torchx for manual many replica (20+) tests
d4l3k committed Jan 16, 2025
1 parent 3ee2360 commit e36c99f
Showing 9 changed files with 114 additions and 19 deletions.
4 changes: 4 additions & 0 deletions .pyre_configuration
@@ -6,5 +6,9 @@
       "import_root": ".",
       "source": "torchft"
     }
   ],
+  "search_path": [
+    {"site-package": "torchx"},
+    {"site-package": "parameterized"}
+  ]
 }
3 changes: 3 additions & 0 deletions .torchxconfig
@@ -0,0 +1,3 @@
[cli:run]
component=torchft/torchx.py:hsdp
scheduler=local_cwd
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -27,7 +27,8 @@ dev = [
     "pyre-check",
     "parameterized",
     "expecttest",
-    "numpy"
+    "numpy",
+    "torchx"
 ]

 [tool.maturin]
13 changes: 9 additions & 4 deletions src/lib.rs
@@ -302,11 +302,16 @@ impl From<Status> for StatusError {
 #[pymodule]
 fn torchft(m: &Bound<'_, PyModule>) -> PyResult<()> {
     // setup logging on import
-    stderrlog::new()
-        .verbosity(2)
+    let mut log = stderrlog::new();
+    log.verbosity(2)
         .show_module_names(true)
-        .timestamp(stderrlog::Timestamp::Millisecond)
-        .init()
+        .timestamp(stderrlog::Timestamp::Millisecond);
+
+    if env::var("CLICOLOR_FORCE").is_ok() {
+        log.color(stderrlog::ColorChoice::AlwaysAnsi);
+    }
+
+    log.init()
         .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;

     m.add_class::<Manager>()?;
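With the CLICOLOR_FORCE check above, the Rust logger only emits ANSI color codes when explicitly asked to; the TorchX component added below sets this variable for every replica. A minimal sketch of opting in from Python (illustrative, not part of this commit; only the variable name comes from the diff):

import os

# Force ANSI-colored output from torchft's Rust logger. Logging is set up when
# the extension module is imported, so the variable must be set beforehand.
os.environ["CLICOLOR_FORCE"] = "1"

import torchft  # noqa: E402  (deliberate late import, after the env setup)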
28 changes: 17 additions & 11 deletions src/lighthouse.rs
@@ -377,21 +377,26 @@ impl Lighthouse {

         let (_, quorum_status) = quorum_compute(Instant::now(), &state, &self.opt);

-        let max_step = {
-            if let Some(quorum) = state.prev_quorum.clone() {
-                quorum
-                    .participants
-                    .iter()
-                    .map(|p| p.step)
-                    .max()
-                    .unwrap_or(-1)
-            } else {
-                -1
-            }
+        let max_step = if let Some(quorum) = &state.prev_quorum {
+            quorum
+                .participants
+                .iter()
+                .map(|p| p.step)
+                .max()
+                .unwrap_or(-1)
+        } else {
+            -1
         };
+
+        let num_participants = if let Some(quorum) = &state.prev_quorum {
+            quorum.participants.len() as i64
+        } else {
+            -1
+        };

         StatusTemplate {
             quorum_id: state.quorum_id,
+            num_participants: num_participants,
             prev_quorum: state.prev_quorum.clone(),
             quorum_status: quorum_status,
             max_step: max_step,
@@ -527,6 +532,7 @@ struct StatusTemplate {
     prev_quorum: Option<Quorum>,
     quorum_id: i64,
     quorum_status: String,
+    num_participants: i64,
     max_step: i64,
     heartbeats: HashMap<String, Instant>,
1 change: 1 addition & 0 deletions templates/status.html
@@ -6,6 +6,7 @@ <h3>Previous Quorum</h3>
 {% if let Some(prev_quorum) = prev_quorum %}

 Previous quorum id: {{prev_quorum.quorum_id}} <br>
+Num participants: {{num_participants}} <br>
 Quorum age:
 {{SystemTime::try_from(prev_quorum.created.unwrap()).unwrap().elapsed().unwrap().as_secs_f64()}}s
3 changes: 0 additions & 3 deletions torchft/manager_integ_test.py
@@ -10,8 +10,6 @@

 import torch
 import torch.distributed as dist
-
-# pyre-fixme[21]: missing module
 from parameterized import parameterized
 from torch import nn, optim
@@ -292,7 +290,6 @@ def test_ddp_healthy(self) -> None:
         for state_dict in state_dicts:
             torch.testing.assert_close(state_dict, state_dicts[0])

-    # pyre-fixme[56]: couldn't infer type of decorator
     @parameterized.expand(
         [
             (
76 changes: 76 additions & 0 deletions torchft/torchx.py
@@ -0,0 +1,76 @@
"""
This is a file for TorchX components used for testing torchft.
"""

import os
from typing import Dict, Optional

import torchx.specs as specs


def hsdp(
    *script_args: str,
    replicas: int = 2,
    workers_per_replica: int = 1,
    max_restarts: int = 10,
    script: str = "train_ddp.py",
    env: Optional[Dict[str, str]] = None,
    image: str = "",
    h: Optional[str] = None,
    cpu: int = 2,
    gpu: int = 0,
    memMB: int = 1024,
) -> specs.AppDef:
    assert replicas > 0, "replicas must be > 0"
    assert workers_per_replica > 0, "workers_per_replica must be > 0"

    env = env or {}

    # Enable logging for PyTorch, torchelastic and Rust.
    env.setdefault("TORCH_CPP_LOG_LEVEL", "INFO")
    env.setdefault("LOGLEVEL", "INFO")
    env.setdefault("RUST_BACKTRACE", "1")

    # Enable colored logging for torchft Rust logger.
    env.setdefault("CLICOLOR_FORCE", "1")

    # Set lighthouse address for replicas
    # This must be run externally
    env.setdefault(
        "TORCHFT_LIGHTHOUSE",
        os.environ.get("TORCHFT_LIGHTHOUSE", f"http://localhost:29510"),
    )

    # Disable CUDA for CPU-only jobs
    env.setdefault("CUDA_VISIBLE_DEVICES", "")

    roles = []
    for replica_id in range(replicas):
        cmd = [
            f"--master_port={29600+replica_id}",
            "--nnodes=1",
            f"--nproc_per_node={workers_per_replica}",
            f"--max_restarts={max_restarts}",
        ]
        if script:
            cmd += [script]
        cmd += list(script_args)

        roles.append(
            specs.Role(
                name=f"replica_{replica_id}",
                image=image,
                min_replicas=workers_per_replica,
                num_replicas=workers_per_replica,
                resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
                max_retries=0,
                env=env,
                entrypoint="torchrun",
                args=cmd,
            )
        )

    return specs.AppDef(
        name="torchft",
        roles=roles,
    )
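The component above is what the new .torchxconfig points `torchx run` at; for the manual 20+ replica tests named in the commit message it can also be submitted programmatically. The sketch below is illustrative and not part of this commit: it assumes the standard TorchX runner API and a lighthouse already serving at the TORCHFT_LIGHTHOUSE address, and the replica count of 25 is arbitrary.

# Illustrative sketch: builds the AppDef from the hsdp component and submits it
# to the local_cwd scheduler, mirroring what `torchx run` does with the
# .torchxconfig defaults.
from torchx.runner import get_runner

from torchft.torchx import hsdp

app = hsdp(replicas=25, workers_per_replica=1)

runner = get_runner()
app_handle = runner.run(app, scheduler="local_cwd")
print(runner.wait(app_handle))  # blocks until every replica role exits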
2 changes: 2 additions & 0 deletions train_ddp.py
@@ -13,6 +13,7 @@
 import torchvision
 import torchvision.transforms as transforms
 from torch import nn, optim
+from torch.distributed.elastic.multiprocessing.errors import record
 from torchdata.stateful_dataloader import StatefulDataLoader

 from torchft import (
@@ -27,6 +28,7 @@
 logging.basicConfig(level=logging.INFO)


+@record
 def main() -> None:
     REPLICA_GROUP_ID = int(os.environ.get("REPLICA_GROUP_ID", 0))
     NUM_REPLICA_GROUPS = int(os.environ.get("NUM_REPLICA_GROUPS", 2))
