@@ -156,6 +156,81 @@ static void xnet_set_no_port(SOCKET sock)
156
156
#define xnet_set_no_port (sock )
157
157
#endif
158
158
159
+ int
160
+ xnet_disable_keepalive (struct xnet_ep * ep )
161
+ {
162
+ int optval = 0 ;
163
+ int ret ;
164
+
165
+ ret = setsockopt (ep -> bsock .sock , SOL_SOCKET , SO_KEEPALIVE , (char * )& optval ,
166
+ sizeof (optval ));
167
+ if (ret ) {
168
+ FI_WARN (& xnet_prov , FI_LOG_EP_CTRL , "set SO_KEEPALIVE failed %d" , ret );
169
+ return - ofi_sockerr ();
170
+ }
171
+
172
+ FI_INFO (& xnet_prov , FI_LOG_EP_CTRL , "ep %p KEEPALIVE is disabled.\n" , ep );
173
+ return ret ;
174
+ }
175
+
176
+ static int
177
+ xnet_enable_keepalive (struct xnet_ep * ep )
178
+ {
179
+ int optval = 1 ;
180
+ int idle_time = 5 ;
181
+ int keep_intvl = 2 ;
182
+ int keep_cnt = 2 ;
183
+ int ret ;
184
+
185
+ ret = setsockopt (ep -> bsock .sock , SOL_SOCKET , SO_KEEPALIVE , (const void * )& optval ,
186
+ sizeof (optval ));
187
+ if (ret ) {
188
+ FI_WARN (& xnet_prov , FI_LOG_EP_CTRL , "set SO_KEEPALIVE failed %d" , ret );
189
+ return - ofi_sockerr ();
190
+ }
191
+
192
+ #ifdef MACOS
193
+ ret = setsockopt (ep -> bsock .sock , IPPROTO_TCP , TCP_KEEPALIVE , (const void * )& idle_time ,
194
+ sizeof (idle_time ));
195
+ #else
196
+ ret = setsockopt (ep -> bsock .sock , IPPROTO_TCP , TCP_KEEPIDLE , (const void * )& idle_time ,
197
+ sizeof (idle_time ));
198
+ #endif
199
+ if (ret ) {
200
+ ret = - ofi_sockerr ();
201
+ FI_WARN (& xnet_prov , FI_LOG_EP_CTRL , "set TCP_KEEPIDLE failed %d" , ret );
202
+ goto out ;
203
+ }
204
+
205
+ ret = setsockopt (ep -> bsock .sock , IPPROTO_TCP , TCP_KEEPINTVL , (const void * )& keep_intvl ,
206
+ sizeof (keep_intvl ));
207
+ if (ret ) {
208
+ ret = - ofi_sockerr ();
209
+ FI_WARN (& xnet_prov , FI_LOG_EP_CTRL , "set TCP_KEEPINTVL failed %d" , ret );
210
+ goto out ;
211
+ }
212
+
213
+ ret = setsockopt (ep -> bsock .sock , IPPROTO_TCP , TCP_KEEPCNT , (const void * )& keep_cnt ,
214
+ sizeof (keep_cnt ));
215
+ if (ret ) {
216
+ ret = - ofi_sockerr ();
217
+ FI_WARN (& xnet_prov , FI_LOG_EP_CTRL , "set SO_KEEPALIVE failed %d" , ret );
218
+ goto out ;
219
+ }
220
+
221
+ FI_INFO (& xnet_prov , FI_LOG_EP_CTRL , "%p KEEPALIVE idle %d intvl %d cnt %d\n" ,
222
+ ep , idle_time , keep_intvl , keep_cnt );
223
+
224
+ out :
225
+ if (ret ) {
226
+ FI_WARN (& xnet_prov , FI_LOG_EP_CTRL , "%p KEEPALIVE set keepalive failed %d\n" ,
227
+ ep , ret );
228
+ xnet_disable_keepalive (ep );
229
+ }
230
+
231
+ return ret ;
232
+ }
233
+
159
234
int xnet_setup_socket (SOCKET sock , struct fi_info * info )
160
235
{
161
236
int ret , optval = 1 ;
@@ -294,6 +369,16 @@ xnet_ep_accept(struct fid_ep *ep_fid, const void *param, size_t paramlen)
294
369
ep -> cm_msg -> hdr .seg_size = htons ((uint16_t ) paramlen );
295
370
}
296
371
372
+ /* Enable keepalive to make sure the socket status can be reset in time
373
+ * if the remote peer is restarted after it gets connreq but not replies.
374
+ */
375
+ ret = xnet_enable_keepalive (ep );
376
+ if (ret ) {
377
+ FI_WARN (& xnet_prov , FI_LOG_EP_CTRL , "%p set tcp keepalive failure:%d\n" ,
378
+ ep , ret );
379
+ return ofi_sockerr () ? - ofi_sockerr () : - FI_EINVAL ;
380
+ }
381
+
297
382
ret = xnet_send_cm_msg (ep );
298
383
if (ret )
299
384
return ret ;
0 commit comments