Skip to content

Commit 4a0a8ef

Browse files
committed
Enable TCP keepalive during connreq processing.
Otherwise, if the remote peer is restarted between receiving the connreq and replying to it, the local side may hang waiting for the connreq reply: the ep state (req_done) does not allow new requests to be sent and the socket state is never reset, so progress cannot detect the disconnection. Signed-off-by: Di Wang <[email protected]>
1 parent 3d04127 commit 4a0a8ef

File tree

3 files changed

+84
-0
lines changed

3 files changed

+84
-0
lines changed

prov/tcp/src/xnet.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -781,4 +781,5 @@ int xnet_rdm_ops_open(struct fid *fid, const char *name,
781781
FI_WARN(&xnet_prov, subsystem, log_str "%s (%d)\n", \
782782
fi_strerror((int) -(err)), (int) err)
783783

784+
int xnet_disable_keepalive(struct xnet_ep *ep);
784785
#endif //_XNET_H_

prov/tcp/src/xnet_cm.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,12 @@ void xnet_req_done(struct xnet_ep *ep)
203203
ret = xnet_req_done_internal(ep);
204204
if (ret)
205205
goto disable;
206+
207+
xnet_disable_keepalive(ep);
206208
return;
207209

208210
disable:
211+
xnet_disable_keepalive(ep);
209212
xnet_ep_disable(ep, -ret, ep->cm_msg->data,
210213
ntohs(ep->cm_msg->hdr.seg_size));
211214
}

prov/tcp/src/xnet_ep.c

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,76 @@ static void xnet_set_no_port(SOCKET sock)
156156
#define xnet_set_no_port(sock)
157157
#endif
158158

159+
int
160+
xnet_disable_keepalive(struct xnet_ep *ep)
161+
{
162+
int optval = 0;
163+
int ret;
164+
165+
ret = setsockopt(ep->bsock.sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&optval,
166+
sizeof(optval));
167+
if (ret) {
168+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set SO_KEEPALIVE failed %d", ret);
169+
return -ofi_sockerr();
170+
}
171+
172+
FI_INFO(&xnet_prov, FI_LOG_EP_CTRL, "ep %p KEEPALIVE is disabled.\n", ep);
173+
return ret;
174+
}
175+
176+
static int
177+
xnet_enable_keepalive(struct xnet_ep *ep)
178+
{
179+
int optval = 1;
180+
int idle_time = 5;
181+
int keep_intvl = 2;
182+
int keep_cnt = 2;
183+
int ret;
184+
185+
ret = setsockopt(ep->bsock.sock, SOL_SOCKET, SO_KEEPALIVE, (const void *)&optval,
186+
sizeof(optval));
187+
if (ret) {
188+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set SO_KEEPALIVE failed %d", ret);
189+
return -ofi_sockerr();
190+
}
191+
192+
ret = setsockopt(ep->bsock.sock, IPPROTO_TCP, TCP_KEEPIDLE, (const void *)&idle_time,
193+
sizeof(idle_time));
194+
if (ret) {
195+
ret = -ofi_sockerr();
196+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set TCP_KEEPIDLE failed %d", ret);
197+
goto out;
198+
}
199+
200+
ret = setsockopt(ep->bsock.sock, IPPROTO_TCP, TCP_KEEPINTVL, (const void *)&keep_intvl,
201+
sizeof(keep_intvl));
202+
if (ret) {
203+
ret = -ofi_sockerr();
204+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set TCP_KEEPINTVL failed %d", ret);
205+
goto out;
206+
}
207+
208+
ret = setsockopt(ep->bsock.sock, IPPROTO_TCP, TCP_KEEPCNT, (const void *)&keep_cnt,
209+
sizeof(keep_cnt));
210+
if (ret) {
211+
ret = -ofi_sockerr();
212+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "set SO_KEEPALIVE failed %d", ret);
213+
goto out;
214+
}
215+
216+
FI_INFO(&xnet_prov, FI_LOG_EP_CTRL, "%p KEEPALIVE idle %d intvl %d cnt %d\n",
217+
ep, idle_time, keep_intvl, keep_cnt);
218+
219+
out:
220+
if (ret) {
221+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "%p KEEPALIVE set keepalive failed %d\n",
222+
ep, ret);
223+
xnet_disable_keepalive(ep);
224+
}
225+
226+
return ret;
227+
}
228+
159229
int xnet_setup_socket(SOCKET sock, struct fi_info *info)
160230
{
161231
int ret, optval = 1;
@@ -294,6 +364,16 @@ xnet_ep_accept(struct fid_ep *ep_fid, const void *param, size_t paramlen)
294364
ep->cm_msg->hdr.seg_size = htons((uint16_t) paramlen);
295365
}
296366

367+
/* Enable keepalive to make sure the socket status can be reset in time
368+
* if the remote peer is restarted after it gets the connreq but before it replies.
369+
*/
370+
ret = xnet_enable_keepalive(ep);
371+
if (ret) {
372+
FI_WARN(&xnet_prov, FI_LOG_EP_CTRL, "%p set tcp keepalive failure:%d\n",
373+
ep, ret);
374+
return ofi_sockerr() ? -ofi_sockerr() : -FI_EINVAL;
375+
}
376+
297377
ret = xnet_send_cm_msg(ep);
298378
if (ret)
299379
return ret;

0 commit comments

Comments
 (0)