Skip to content

Commit 6ae4922

Browse files
authored
DAOS-16796 container: server handle cleanup (daos-stack#16793)
Server handle cleanup to avoid potential deadlock & other issues. - Never try to fetch server handle through IV_CONT_CAPA. - Never try to lookup server handle in normal handle DB. - Don't call ds_cont_tgt_open/close() for IV_POOL_HDL to avoid race. - Use same ULT to periodically report EC agg epoch, fetch pool connection handles, server open handles. - Don't put server handle in container handle hash, otherwise, the handle will be leaked on container stop/destroy. - I/O request handler won't go through normal handle code path when the server handle is used. If the server handle isn't propagated to server, -DER_STALE will be returned for retry. - Introduce ds_cont_child::sc_open_mutex to serialize cont open in the ds_cont_local_open(). - Fix error cleanup defect in ds_cont_local_open() which could call dtx_cont_close() mistakenly. Signed-off-by: Niu Yawei <yawei.niu@hpe.com>
1 parent 7552446 commit 6ae4922

9 files changed

Lines changed: 460 additions & 344 deletions

File tree

src/container/container_iv.c

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -502,13 +502,7 @@ cont_iv_ent_fetch(struct ds_iv_entry *entry, struct ds_iv_key *key,
502502
struct daos_prop_entry *prop_entry;
503503
struct daos_co_status stat = { 0 };
504504

505-
if (uuid_is_null(chdl.ch_cont)) {
506-
/* Skip for container server handler */
507-
iv_entry.iv_capa.sec_capas =
508-
ds_sec_get_rebuild_cont_capabilities();
509-
iv_entry.iv_capa.flags = 0;
510-
D_GOTO(out, rc = 0);
511-
}
505+
D_ASSERT(!uuid_is_null(chdl.ch_cont));
512506
rc = ds_cont_get_prop(entry->ns->iv_pool_uuid,
513507
chdl.ch_cont, &prop);
514508
if (rc) {
@@ -980,6 +974,11 @@ cont_iv_capa_refresh_ult(void *data)
980974
D_GOTO(out, rc);
981975

982976
D_ASSERT(pool != NULL);
977+
/* Trying to fetch server handle through IV_CONT_CAPA indicates a BUG */
978+
D_ASSERTF(uuid_compare(pool->sp_srv_cont_hdl, arg->cont_hdl_uuid) != 0,
979+
"srv_hdl:" DF_UUID ", hdl:" DF_UUID "", DP_UUID(pool->sp_srv_cont_hdl),
980+
DP_UUID(arg->cont_hdl_uuid));
981+
983982
if (arg->invalidate_current) {
984983
rc = cont_iv_capability_invalidate(pool->sp_iv_ns,
985984
arg->cont_hdl_uuid,
@@ -1662,9 +1661,68 @@ ds_cont_fetch_prop(uuid_t po_uuid, uuid_t co_uuid, daos_prop_t *cont_prop)
16621661
return cont_iv_prop_fetch(po_uuid, co_uuid, cont_prop);
16631662
}
16641663

1664+
struct copy_hdl_arg {
1665+
struct ds_pool *pool;
1666+
uuid_t srv_cont_hdl;
1667+
};
1668+
1669+
static int
1670+
copy_srv_cont_hdl(void *arg)
1671+
{
1672+
struct copy_hdl_arg *copy_arg = arg;
1673+
struct ds_pool *pool = copy_arg->pool;
1674+
1675+
if (!uuid_is_null(pool->sp_srv_cont_hdl)) {
1676+
uuid_copy(copy_arg->srv_cont_hdl, pool->sp_srv_cont_hdl);
1677+
return 0;
1678+
}
1679+
return -DER_NO_HDL;
1680+
}
1681+
16651682
int
16661683
ds_cont_find_hdl(uuid_t po_uuid, uuid_t coh_uuid, struct ds_cont_hdl **coh_p)
16671684
{
1685+
struct ds_pool_child *pool_child;
1686+
struct ds_cont_hdl *hdl;
1687+
1688+
pool_child = ds_pool_child_lookup(po_uuid);
1689+
if (pool_child == NULL) {
1690+
D_ERROR(DF_UUID ": Failed to find pool child.", DP_UUID(po_uuid));
1691+
return -DER_NO_HDL;
1692+
}
1693+
1694+
/* Return a retry-able error when the srv handle not propagated */
1695+
if (d_list_empty(&pool_child->spc_srv_cont_hdl)) {
1696+
struct copy_hdl_arg arg;
1697+
int rc;
1698+
1699+
/*
1700+
* Sometimes the srv container handle failed to be propagated to the pool
1701+
* child when it's target is in DOWN state. Let's fix it here.
1702+
*/
1703+
rc = dss_ult_execute(copy_srv_cont_hdl, &arg, NULL, NULL, DSS_XS_SYS, 0, 0);
1704+
if (!rc) {
1705+
rc = ds_cont_srv_open(po_uuid, arg.srv_cont_hdl);
1706+
if (!rc) {
1707+
D_ASSERT(!d_list_empty(&pool_child->spc_srv_cont_hdl));
1708+
goto srv_hdl_ready;
1709+
}
1710+
}
1711+
ds_pool_child_put(pool_child);
1712+
D_INFO(DF_UUID ": Server handle isn't propagated yet.\n", DP_UUID(po_uuid));
1713+
return -DER_STALE;
1714+
}
1715+
1716+
srv_hdl_ready:
1717+
hdl = d_list_entry(pool_child->spc_srv_cont_hdl.next, struct ds_cont_hdl, sch_link);
1718+
D_ASSERT(!uuid_is_null(hdl->sch_uuid));
1719+
1720+
ds_pool_child_put(pool_child);
1721+
if (uuid_compare(hdl->sch_uuid, coh_uuid) == 0) {
1722+
*coh_p = hdl;
1723+
return 0;
1724+
}
1725+
16681726
/* NB: it can be called from any xstream */
16691727
return cont_iv_hdl_fetch(coh_uuid, po_uuid, coh_p);
16701728
}

src/container/srv_container.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6014,14 +6014,14 @@ ds_cont_hdl_rdb_lookup(uuid_t pool_uuid, uuid_t cont_hdl_uuid, struct container_
60146014

60156015
rc = cont_svc_lookup_leader(pool_uuid, 0 /* id */, &svc, NULL);
60166016
if (rc != 0) {
6017-
D_ERROR(DF_CONT": find leader: %d\n",
6018-
DP_CONT(pool_uuid, cont_hdl_uuid), rc);
6017+
D_ERROR(DF_UUID ": find leader: %d\n", DP_UUID(pool_uuid), rc);
60196018
return rc;
60206019
}
60216020

6022-
/* check if it is server container hdl */
6023-
if (uuid_compare(cont_hdl_uuid, svc->cs_pool->sp_srv_cont_hdl) == 0)
6024-
D_GOTO(put, rc);
6021+
/* Looking up the server handle in the container open handle DB indicates a BUG */
6022+
D_ASSERTF(uuid_compare(cont_hdl_uuid, svc->cs_pool->sp_srv_cont_hdl) != 0,
6023+
"srv hdl:" DF_UUID ", hdl:" DF_UUID "\n", DP_UUID(svc->cs_pool->sp_srv_cont_hdl),
6024+
DP_UUID(cont_hdl_uuid));
60256025

60266026
rc = rdb_tx_begin(svc->cs_rsvc->s_db, svc->cs_rsvc->s_term, &tx);
60276027
if (rc != 0)

0 commit comments

Comments
 (0)