From aec61c7688e7ea5214ae3795692c6a2f3498036d Mon Sep 17 00:00:00 2001 From: Tristan Rice Date: Sun, 10 Nov 2024 23:58:10 -0800 Subject: [PATCH] train, manager, dashboard: show world size on dashboard, manual replica_id, convergence tweaks (#11) --- proto/torchft.proto | 1 + src/lighthouse.rs | 7 +++++++ src/manager.rs | 1 + templates/status.html | 3 ++- torchft/manager.py | 7 +++++-- train_ddp.py | 12 ++++++++---- 6 files changed, 24 insertions(+), 7 deletions(-) diff --git a/proto/torchft.proto b/proto/torchft.proto index 1b0f51b..2513996 100644 --- a/proto/torchft.proto +++ b/proto/torchft.proto @@ -40,6 +40,7 @@ message QuorumMember { string address = 2; string store_address = 3; int64 step = 4; + uint64 world_size = 5; } message Quorum { diff --git a/src/lighthouse.rs b/src/lighthouse.rs index d6296b9..63a8682 100644 --- a/src/lighthouse.rs +++ b/src/lighthouse.rs @@ -463,6 +463,7 @@ mod tests { address: "".to_string(), store_address: "".to_string(), step: 1, + world_size: 1, }, }, ); @@ -495,6 +496,7 @@ mod tests { address: "".to_string(), store_address: "".to_string(), step: 1, + world_size: 1, }, }, ); @@ -511,6 +513,7 @@ mod tests { address: "".to_string(), store_address: "".to_string(), step: 1, + world_size: 1, }], created: Some(SystemTime::now().into()), }); @@ -550,6 +553,7 @@ mod tests { address: "".to_string(), store_address: "".to_string(), step: 10, + world_size: 1, }), }); @@ -568,12 +572,14 @@ mod tests { address: "".to_string(), store_address: "".to_string(), step: 1, + world_size: 1, }]; let b = vec![QuorumMember { replica_id: "1".to_string(), address: "changed".to_string(), store_address: "changed".to_string(), step: 1000, + world_size: 1, }]; // replica_id is the same @@ -584,6 +590,7 @@ mod tests { address: "".to_string(), store_address: "".to_string(), step: 1, + world_size: 1, }]; // replica_id changed assert!(quorum_changed(&a, &c)); diff --git a/src/manager.rs b/src/manager.rs index e42f10b..286e700 100644 --- a/src/manager.rs +++ b/src/manager.rs @@ -192,6 +192,7 @@ impl ManagerService for Arc { address: self.address.clone(), store_address: self.store_address.clone(), step: req.step, + world_size: self.world_size, }), }); diff --git a/templates/status.html b/templates/status.html index 11f2877..429419d 100644 --- a/templates/status.html +++ b/templates/status.html @@ -19,7 +19,8 @@

Previous Quorum

{{ member.replica_id }}
Step: {{ member.step }}
Manager: {{ member.address }}
- TCPStore: {{ member.store_address }} + TCPStore: {{ member.store_address }}
+ World size: {{ member.world_size }}