Skip to content

Commit 85aa477

Browse files
committed
UCT/IB/RDMACM: Take async block around unprotected librdmacm calls
The cm event handler runs under the cm's async block (held by ucs_async_handler_dispatch via context_try_block). rdma_destroy_id and related librdmacm state-mutating calls must take the same block to serialize with rdma_get_cm_event() running in the handler; otherwise a concurrent destroy can remove the cm_id's userspace tracking while the handler is mid-lookup inside librdmacm, producing a NULL deref at pthread_mutex_lock(&id_priv->mut) inside rdma_get_cm_event(). The ep destructor and the listener destructor already take the block. Four other call sites did not: - rdmacm_listener.c listener init error path (rdma_destroy_id on rdma_bind_addr / rdma_listen failure) - rdmacm_listener.c uct_rdmacm_listener_reject (rdma_reject + rdma_destroy_id + rdma_ack_cm_event on a connect_request) - rdmacm_cm_ep.c client init error path (rdma_destroy_id on rdma_resolve_addr failure) - rdmacm_cm_ep.c server init (rdma_ack_cm_event + rdma_migrate_id, plus rdma_destroy_id on the error path; the success-path server_send_priv_data already takes the same (recursive) block) Observed as a SIGSEGV inside librdmacm during multi-threaded sockaddr gtests (test_ucp_sockaddr_destroy_ep_on_err, test_ucp_sockaddr_wireup_fail) where ep teardown and connect_request rejection interleave with event delivery on the same channel. Signed-off-by: NirWolfer <nwolfer@nvidia.com>
1 parent 2e11735 commit 85aa477

2 files changed

Lines changed: 19 additions & 1 deletion

File tree

src/uct/ib/rdmacm/rdmacm_cm_ep.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,9 @@ static ucs_status_t uct_rdmacm_cm_ep_client_init(uct_rdmacm_cm_ep_t *cep,
522522
return UCS_OK;
523523

524524
err_destroy_id:
525+
UCS_ASYNC_BLOCK(uct_rdmacm_cm_ep_get_async(cep));
525526
uct_rdmacm_cm_destroy_id(cep->id);
527+
UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_ep_get_async(cep));
526528
err:
527529
return status;
528530
}
@@ -587,6 +589,11 @@ static ucs_status_t uct_rdmacm_cm_ep_server_init(uct_rdmacm_cm_ep_t *cep,
587589
ucs_status_t status;
588590
char ep_str[UCT_RDMACM_EP_STRING_LEN];
589591

592+
/* Serialize the librdmacm ack/migrate/destroy calls on this channel
593+
* against the cm event handler. server_send_priv_data takes the same
594+
* (recursive) block on the success path. */
595+
UCS_ASYNC_BLOCK(uct_rdmacm_cm_ep_get_async(cep));
596+
590597
status = uct_rdmacm_cm_ack_event(event);
591598
if (status != UCS_OK) {
592599
goto err_destroy_id;
@@ -621,12 +628,14 @@ static ucs_status_t uct_rdmacm_cm_ep_server_init(uct_rdmacm_cm_ep_t *cep,
621628
id, listen_ev_ch, cm, cm->ev_ch);
622629
}
623630

631+
UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_ep_get_async(cep));
624632
return uct_rdmacm_cm_ep_server_send_priv_data(cep, params);
625633

626634
err:
627635
cep->id = NULL;
628636
err_destroy_id:
629637
uct_rdmacm_cm_destroy_id(id);
638+
UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_ep_get_async(cep));
630639
return status;
631640
}
632641

src/uct/ib/rdmacm/rdmacm_listener.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,9 @@ UCS_CLASS_INIT_FUNC(uct_rdmacm_listener_t, uct_cm_h cm,
106106
return UCS_OK;
107107

108108
err_destroy_id:
109+
UCS_ASYNC_BLOCK(uct_rdmacm_cm_get_async(rdmacm_cm));
109110
uct_rdmacm_cm_destroy_id(self->id);
111+
UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_get_async(rdmacm_cm));
110112
err:
111113
return status;
112114
}
@@ -119,12 +121,19 @@ ucs_status_t uct_rdmacm_listener_reject(uct_listener_h listener,
119121
uct_rdmacm_cm_t *rdmacm_cm = ucs_derived_of(listener->cm,
120122
uct_rdmacm_cm_t);
121123
struct rdma_cm_event *event = (struct rdma_cm_event*)conn_request;
124+
ucs_status_t status;
122125

123126
ucs_assert_always(rdmacm_listener->id == event->listen_id);
124127

128+
/* Serialize against the cm event handler, which holds the same async
129+
* block when calling rdma_get_cm_event() on this channel. */
130+
UCS_ASYNC_BLOCK(uct_rdmacm_cm_get_async(rdmacm_cm));
125131
uct_rdmacm_cm_reject(rdmacm_cm, event->id);
126132
uct_rdmacm_cm_destroy_id(event->id);
127-
return uct_rdmacm_cm_ack_event(event);
133+
status = uct_rdmacm_cm_ack_event(event);
134+
UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_get_async(rdmacm_cm));
135+
136+
return status;
128137
}
129138

130139
UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_listener_t)

0 commit comments

Comments
 (0)