Skip to content

Commit d69293d

Browse files
committed
Enable TCP keepalive during connreq processing. Otherwise,
if the remote peer is restarted between receiving the connreq and replying to it, the endpoint may hang waiting for the connreq reply: the ep state (req_done) does not allow sending new requests and the socket state is never reset, so progress is unable to detect the disconnection. Signed-off-by: Di Wang <[email protected]>
1 parent 3d04127 commit d69293d

File tree

4 files changed

+93
-0
lines changed

4 files changed

+93
-0
lines changed

prov/tcp/Makefile.include

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,8 @@ prov_install_man_pages += man/man7/fi_tcp.7
3535

3636
endif HAVE_TCP
3737

38+
if MACOS
39+
AM_CPPFLAGS += -DMACOS
40+
endif MACOS
41+
3842
prov_dist_man_pages += man/man7/fi_tcp.7

prov/tcp/src/xnet.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -781,4 +781,5 @@ int xnet_rdm_ops_open(struct fid *fid, const char *name,
781781
FI_WARN(&xnet_prov, subsystem, log_str "%s (%d)\n", \
782782
fi_strerror((int) -(err)), (int) err)
783783

784+
int xnet_disable_keepalive(struct xnet_ep *ep);
784785
#endif //_XNET_H_

prov/tcp/src/xnet_cm.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,12 @@ void xnet_req_done(struct xnet_ep *ep)
203203
ret = xnet_req_done_internal(ep);
204204
if (ret)
205205
goto disable;
206+
207+
xnet_disable_keepalive(ep);
206208
return;
207209

208210
disable:
211+
xnet_disable_keepalive(ep);
209212
xnet_ep_disable(ep, -ret, ep->cm_msg->data,
210213
ntohs(ep->cm_msg->hdr.seg_size));
211214
}

prov/tcp/src/xnet_ep.c

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,81 @@ static void xnet_set_no_port(SOCKET sock)
156156
#define xnet_set_no_port(sock)
157157
#endif
158158

159+
int
160+
xnet_disable_keepalive(struct xnet_ep *ep)
161+
{
162+
int optval = 0;
163+
int ret;
164+
165+
ret = setsockopt(ep->bsock.sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&optval,
166+
sizeof(optval));
167+
if (ret) {
168+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set SO_KEEPALIVE failed %d", ret);
169+
return -ofi_sockerr();
170+
}
171+
172+
FI_INFO(&xnet_prov, FI_LOG_EP_CTRL, "ep %p KEEPALIVE is disabled.\n", ep);
173+
return ret;
174+
}
175+
176+
static int
177+
xnet_enable_keepalive(struct xnet_ep *ep)
178+
{
179+
int optval = 1;
180+
int idle_time = 5;
181+
int keep_intvl = 2;
182+
int keep_cnt = 2;
183+
int ret;
184+
185+
ret = setsockopt(ep->bsock.sock, SOL_SOCKET, SO_KEEPALIVE, (const void *)&optval,
186+
sizeof(optval));
187+
if (ret) {
188+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set SO_KEEPALIVE failed %d", ret);
189+
return -ofi_sockerr();
190+
}
191+
192+
#ifdef MACOS
193+
ret = setsockopt(ep->bsock.sock, IPPROTO_TCP, TCP_KEEPALIVE, (const void *)&idle_time,
194+
sizeof(idle_time));
195+
#else
196+
ret = setsockopt(ep->bsock.sock, IPPROTO_TCP, TCP_KEEPIDLE, (const void *)&idle_time,
197+
sizeof(idle_time));
198+
#endif
199+
if (ret) {
200+
ret = -ofi_sockerr();
201+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set TCP_KEEPIDLE failed %d", ret);
202+
goto out;
203+
}
204+
205+
ret = setsockopt(ep->bsock.sock, IPPROTO_TCP, TCP_KEEPINTVL, (const void *)&keep_intvl,
206+
sizeof(keep_intvl));
207+
if (ret) {
208+
ret = -ofi_sockerr();
209+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set TCP_KEEPINTVL failed %d", ret);
210+
goto out;
211+
}
212+
213+
ret = setsockopt(ep->bsock.sock, IPPROTO_TCP, TCP_KEEPCNT, (const void *)&keep_cnt,
214+
sizeof(keep_cnt));
215+
if (ret) {
216+
ret = -ofi_sockerr();
217+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set SO_KEEPALIVE failed %d", ret);
218+
goto out;
219+
}
220+
221+
FI_INFO(&xnet_prov, FI_LOG_EP_CTRL, "%p KEEPALIVE idle %d intvl %d cnt %d\n",
222+
ep, idle_time, keep_intvl, keep_cnt);
223+
224+
out:
225+
if (ret) {
226+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "%p KEEPALIVE set keepalive failed %d\n",
227+
ep, ret);
228+
xnet_disable_keepalive(ep);
229+
}
230+
231+
return ret;
232+
}
233+
159234
int xnet_setup_socket(SOCKET sock, struct fi_info *info)
160235
{
161236
int ret, optval = 1;
@@ -294,6 +369,16 @@ xnet_ep_accept(struct fid_ep *ep_fid, const void *param, size_t paramlen)
294369
ep->cm_msg->hdr.seg_size = htons((uint16_t) paramlen);
295370
}
296371

372+
/* Enable keepalive to make sure the socket status can be reset in time
373+
* if the remote peer is restarted after it gets connreq but not replies.
374+
*/
375+
ret = xnet_enable_keepalive(ep);
376+
if (ret) {
377+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "%p set tcp keepalive failure:%d\n",
378+
ep, ret);
379+
return ofi_sockerr() ? -ofi_sockerr() : -FI_EINVAL;
380+
}
381+
297382
ret = xnet_send_cm_msg(ep);
298383
if (ret)
299384
return ret;

0 commit comments

Comments
 (0)