6
6
7
7
pub mod lighthouse;
8
8
pub mod manager;
9
+ mod net;
10
+ mod retry;
11
+ mod timeout;
9
12
10
13
use core:: time:: Duration ;
11
14
use std:: env;
@@ -46,6 +49,7 @@ impl Manager {
46
49
store_addr : String ,
47
50
world_size : u64 ,
48
51
heartbeat_interval : Duration ,
52
+ connect_timeout : Duration ,
49
53
) -> PyResult < Self > {
50
54
py. allow_threads ( move || {
51
55
let runtime = Runtime :: new ( ) ?;
@@ -58,6 +62,7 @@ impl Manager {
58
62
store_addr,
59
63
world_size,
60
64
heartbeat_interval,
65
+ connect_timeout,
61
66
) )
62
67
. map_err ( |e| PyRuntimeError :: new_err ( e. to_string ( ) ) ) ?;
63
68
let handle = runtime. spawn ( manager. clone ( ) . run ( ) ) ;
@@ -84,36 +89,33 @@ impl Manager {
84
89
struct ManagerClient {
85
90
runtime : Runtime ,
86
91
client : ManagerServiceClient < Channel > ,
87
- timeout : Duration ,
88
92
}
89
93
90
94
#[ pymethods]
91
95
impl ManagerClient {
92
96
#[ new]
93
- fn new ( py : Python < ' _ > , addr : String , timeout : Duration ) -> PyResult < Self > {
97
+ fn new ( py : Python < ' _ > , addr : String , connect_timeout : Duration ) -> PyResult < Self > {
94
98
py. allow_threads ( move || {
95
99
let runtime = Runtime :: new ( ) ?;
96
100
let client = runtime
97
- . block_on ( manager:: manager_client_new ( addr, timeout ) )
101
+ . block_on ( manager:: manager_client_new ( addr, connect_timeout ) )
98
102
. map_err ( |e| PyRuntimeError :: new_err ( e. to_string ( ) ) ) ?;
99
103
100
104
Ok ( Self {
101
105
runtime : runtime,
102
106
client : client,
103
- timeout : timeout,
104
107
} )
105
108
} )
106
109
}
107
110
108
- #[ pyo3( signature = ( rank, step, checkpoint_server_addr, shrink_only, timeout=None ) ) ]
109
111
fn quorum (
110
112
& mut self ,
111
113
py : Python < ' _ > ,
112
114
rank : i64 ,
113
115
step : i64 ,
114
116
checkpoint_server_addr : String ,
115
117
shrink_only : bool ,
116
- timeout : Option < Duration > ,
118
+ timeout : Duration ,
117
119
) -> Result < ( i64 , i64 , i64 , String , String , i64 , Option < i64 > , i64 , bool ) , StatusError > {
118
120
py. allow_threads ( move || {
119
121
let mut request = tonic:: Request :: new ( ManagerQuorumRequest {
@@ -122,9 +124,10 @@ impl ManagerClient {
122
124
checkpoint_server_addr : checkpoint_server_addr,
123
125
shrink_only : shrink_only,
124
126
} ) ;
125
- // This notifies the server about the timeout but doesn't affect the
126
- // endpoint timeout which we set on client creation.
127
- request. set_timeout ( timeout. unwrap_or ( self . timeout ) ) ;
127
+
128
+ // This timeout is processed on the server side so we also enable
129
+ // keep alives to detect server health.
130
+ request. set_timeout ( timeout) ;
128
131
129
132
let response = self . runtime . block_on ( self . client . quorum ( request) ) ?;
130
133
let resp = response. into_inner ( ) ;
@@ -142,18 +145,18 @@ impl ManagerClient {
142
145
} )
143
146
}
144
147
145
- #[ pyo3( signature = ( rank, timeout=None ) ) ]
146
148
fn checkpoint_address (
147
149
& mut self ,
148
150
py : Python < ' _ > ,
149
151
rank : i64 ,
150
- timeout : Option < Duration > ,
152
+ timeout : Duration ,
151
153
) -> Result < String , StatusError > {
152
154
py. allow_threads ( move || {
153
155
let mut request = tonic:: Request :: new ( CheckpointAddressRequest { rank : rank } ) ;
154
- // This notifies the server about the timeout but doesn't affect the
155
- // endpoint timeout which we set on client creation.
156
- request. set_timeout ( timeout. unwrap_or ( self . timeout ) ) ;
156
+
157
+ // This timeout is processed on the server side so we also enable
158
+ // keep alives to detect server health.
159
+ request. set_timeout ( timeout) ;
157
160
158
161
let response = self
159
162
. runtime
@@ -163,24 +166,24 @@ impl ManagerClient {
163
166
} )
164
167
}
165
168
166
- #[ pyo3( signature = ( rank, step, should_commit, timeout=None ) ) ]
167
169
fn should_commit (
168
170
& mut self ,
169
171
py : Python < ' _ > ,
170
172
rank : i64 ,
171
173
step : i64 ,
172
174
should_commit : bool ,
173
- timeout : Option < Duration > ,
175
+ timeout : Duration ,
174
176
) -> Result < bool , StatusError > {
175
177
py. allow_threads ( move || {
176
178
let mut request = tonic:: Request :: new ( ShouldCommitRequest {
177
179
rank : rank,
178
180
step : step,
179
181
should_commit : should_commit,
180
182
} ) ;
183
+
181
184
// This timeout is processed on the server side so we also enable
182
185
// keep alives to detect server health.
183
- request. set_timeout ( timeout. unwrap_or ( self . timeout ) ) ;
186
+ request. set_timeout ( timeout) ;
184
187
185
188
let response = self . runtime . block_on ( self . client . should_commit ( request) ) ?;
186
189
let resp = response. into_inner ( ) ;
0 commit comments