Skip to content

Commit 79572e6

Browse files
authored
lighthouse/quorum: avoid split brain and add shrink_only support (#71)
1 parent 97ad397 commit 79572e6

File tree

7 files changed

+261
-61
lines changed

7 files changed

+261
-61
lines changed

proto/torchft.proto

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ message QuorumMember {
4141
string store_address = 3;
4242
int64 step = 4;
4343
uint64 world_size = 5;
44+
bool shrink_only = 6;
4445
}
4546

4647
message Quorum {
@@ -72,6 +73,7 @@ message ManagerQuorumRequest {
7273
int64 rank = 1;
7374
int64 step = 2;
7475
string checkpoint_server_addr = 3;
76+
bool shrink_only = 4;
7577
}
7678

7779
message ManagerQuorumResponse {

src/lib.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -105,20 +105,22 @@ impl ManagerClient {
105105
})
106106
}
107107

108-
#[pyo3(signature = (rank, step, checkpoint_server_addr, timeout=None))]
108+
#[pyo3(signature = (rank, step, checkpoint_server_addr, shrink_only, timeout=None))]
109109
fn quorum(
110110
&mut self,
111111
py: Python<'_>,
112112
rank: i64,
113113
step: i64,
114114
checkpoint_server_addr: String,
115+
shrink_only: bool,
115116
timeout: Option<Duration>,
116117
) -> Result<(i64, i64, i64, String, String, i64, Option<i64>, i64, bool), StatusError> {
117118
py.allow_threads(move || {
118119
let mut request = tonic::Request::new(ManagerQuorumRequest {
119120
rank: rank,
120121
step: step,
121122
checkpoint_server_addr: checkpoint_server_addr,
123+
shrink_only: shrink_only,
122124
});
123125
// This notifies the server about the timeout but doesn't affect the
124126
// endpoint timeout which we set on client creation.

0 commit comments

Comments
 (0)