Skip to content

Commit 8247faf

Browse files
authored
lighthouse, manager: support multiple quorum rooms (#48)
1 parent 49d2aec commit 8247faf

File tree

7 files changed

+287
-205
lines changed

7 files changed

+287
-205
lines changed

proto/torchft.proto

+12 -4
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,11 @@ message Quorum {
5050
}
5151

5252
message LighthouseQuorumRequest {
53-
QuorumMember requester = 1;
53+
// room_id is the specific quorum channel to use. All workers/replicas
54+
// participating in the quorum must specify the same channel.
55+
// Multiple channels can be active simultaneously.
56+
string room_id = 1;
57+
QuorumMember requester = 2;
5458
}
5559

5660
message LighthouseQuorumResponse {
@@ -69,9 +73,13 @@ service LighthouseService {
6973
}
7074

7175
message ManagerQuorumRequest {
72-
int64 rank = 1;
73-
int64 step = 2;
74-
string checkpoint_server_addr = 3;
76+
// room_id is the specific quorum channel to use. All workers/replicas
77+
// participating in the quorum must specify the same channel.
78+
// Multiple channels can be active simultaneously.
79+
string room_id = 1;
80+
int64 rank = 2;
81+
int64 step = 3;
82+
string checkpoint_server_addr = 4;
7583
}
7684

7785
message ManagerQuorumResponse {

src/lib.rs

+2
Original file line number | Diff line number | Diff line change
@@ -105,12 +105,14 @@ impl ManagerClient {
105105
fn quorum(
106106
&mut self,
107107
py: Python<'_>,
108+
room_id: String,
108109
rank: i64,
109110
step: i64,
110111
checkpoint_server_addr: String,
111112
) -> PyResult<(i64, i64, i64, String, String, i64, Option<i64>, i64, bool)> {
112113
py.allow_threads(move || {
113114
let mut request = tonic::Request::new(ManagerQuorumRequest {
115+
room_id: room_id,
114116
rank: rank,
115117
step: step,
116118
checkpoint_server_addr: checkpoint_server_addr,

0 commit comments

Comments
 (0)