Skip to content

Commit d142c6d

Browse files
DAOS-18976 rebuild: refine rebuild gen handling (#18380)
1. For RECLAIM/FAIL_RECLAIM, always bump the rebuild gen. 2. Abort the local rpt when the rgt is done. Also includes some log refinement. Signed-off-by: Xuezhao Liu <xuezhao.liu@hpe.com> Co-authored-by: Liang Zhen <gnailzenh@gmail.com>
1 parent d9993b9 commit d142c6d

3 files changed

Lines changed: 32 additions & 12 deletions

File tree

src/rebuild/rebuild_iv.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
22
* (C) Copyright 2017-2024 Intel Corporation.
3-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
*
55
* SPDX-License-Identifier: BSD-2-Clause-Patent
66
*/
@@ -188,8 +188,8 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key,
188188

189189
if (ref_rc != 0) {
190190
rc = ref_rc;
191-
DL_WARN(rc, DF_UUID "bypass refresh, IV class id %d.",
192-
DP_UUID(entry->ns->iv_pool_uuid), key->class_id);
191+
DL_WARN(rc, DF_RB ", IV ns pool " DF_UUID "bypass refresh, IV class id %d.",
192+
DP_RB_RPT(rpt), DP_UUID(entry->ns->iv_pool_uuid), key->class_id);
193193
goto out;
194194
}
195195

src/rebuild/scan.c

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -131,14 +131,17 @@ rebuild_obj_send_cb(struct tree_cache_root *root, struct rebuild_send_arg *arg)
131131

132132
if (rpt->rt_abort || rpt->rt_finishing || rpt->rt_global_done) {
133133
rc = -DER_SHUTDOWN;
134-
DL_INFO(rc, DF_RB ": give up ds_object_migrate_send, shutdown rebuild",
135-
DP_RB_RPT(rpt));
134+
DL_INFO(rc,
135+
DF_RB ": rt_abort %d, rt_finishing %d, rt_global_done %d, "
136+
"give up ds_object_migrate_send, shutdown rebuild",
137+
DP_RB_RPT(rpt), rpt->rt_abort, rpt->rt_finishing,
138+
rpt->rt_global_done);
136139
break;
137140
}
138141

139142
/* otherwise let's retry */
140-
D_DEBUG(DB_REBUILD, DF_UUID" retry send object to tgt_id %d\n",
141-
DP_UUID(rpt->rt_pool_uuid), arg->tgt_id);
143+
D_DEBUG(DB_REBUILD, DF_RB " retry send object to tgt_id %d\n", DP_RB_RPT(rpt),
144+
arg->tgt_id);
142145
dss_sleep(daos_rpc_rand_delay(max_delay) << 10);
143146
}
144147
out:
@@ -360,8 +363,10 @@ rebuild_objects_send_ult(void *data)
360363
D_FREE(ephs);
361364
if (punched_ephs != NULL)
362365
D_FREE(punched_ephs);
363-
if (rc != 0 && tls->rebuild_pool_status == 0)
366+
if (rc != 0 && tls->rebuild_pool_status == 0) {
367+
DL_ERROR(rc, DF_RB " set rebuild_pool_status as failed", DP_RB_RPT(rpt));
364368
tls->rebuild_pool_status = rc;
369+
}
365370

366371
rpt_put(rpt);
367372
}
@@ -1305,6 +1310,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc)
13051310
rpt->rt_re_report = 1;
13061311

13071312
rpt->rt_leader_rank = rsi->rsi_master_rank;
1313+
rpt->rt_rebuild_gen = rsi->rsi_rebuild_gen;
13081314

13091315
/* If this is the old leader, then also stop the rebuild tracking ULT. */
13101316
rebuild_leader_stop(rsi->rsi_pool_uuid, rsi->rsi_rebuild_ver,

src/rebuild/srv.c

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1459,7 +1459,8 @@ rebuild_leader_start(struct ds_pool *pool, struct rebuild_task *task,
14591459
*/
14601460
ds_rebuild_running_query_adv(pool->sp_uuid, -1, &version, NULL, &generation,
14611461
&rebuild_leader_rank, &rebuild_leader_term);
1462-
if ((version < task->dst_map_ver) ||
1462+
if (task->dst_rebuild_op == RB_OP_RECLAIM || task->dst_rebuild_op == RB_OP_FAIL_RECLAIM ||
1463+
(version < task->dst_map_ver) ||
14631464
(version == task->dst_map_ver && leader_rank == rebuild_leader_rank &&
14641465
leader_term == rebuild_leader_term))
14651466
generation = ++pool->sp_rebuild_gen;
@@ -1806,6 +1807,8 @@ rebuild_task_ult(void *arg)
18061807
* rebuild.
18071808
*/
18081809
if (rgt && rgt->rgt_init_scan) {
1810+
struct rebuild_tgt_pool_tracker *local_rpt;
1811+
18091812
if (myrank != pool->sp_iv_ns->iv_master_rank) {
18101813
/* If master has been changed, then let's skip
18111814
* iv sync, and the new leader will take over
@@ -1817,6 +1820,14 @@ rebuild_task_ult(void *arg)
18171820
}
18181821

18191822
rebuild_leader_status_notify(rgt, pool, task->dst_rebuild_op, myrank);
1823+
1824+
local_rpt = rpt_lookup(pool->sp_uuid, task->dst_rebuild_op, rgt->rgt_rebuild_ver,
1825+
rgt->rgt_rebuild_gen);
1826+
if (local_rpt) {
1827+
local_rpt->rt_abort = 1;
1828+
D_INFO(DF_RB " set rt_abort", DP_RB_RPT(local_rpt));
1829+
rpt_put(local_rpt);
1830+
}
18201831
}
18211832

18221833
try_reschedule:
@@ -2385,8 +2396,7 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt)
23852396
/* destroy the migrate_tls of 0-xstream */
23862397
ds_migrate_stop(rpt->rt_pool, rpt->rt_rebuild_ver, rpt->rt_rebuild_gen);
23872398
/* No one should access rpt after rebuild_fini_one. */
2388-
D_INFO("Finalized rebuild for "DF_UUID", map_ver=%u.\n",
2389-
DP_UUID(rpt->rt_pool_uuid), rpt->rt_rebuild_ver);
2399+
DL_INFO(rc, DF_RB " Finalized rebuild", DP_RB_RPT(rpt));
23902400
rpt_delete(rpt);
23912401
}
23922402

@@ -2515,8 +2525,12 @@ rebuild_tgt_status_check_ult(void *arg)
25152525
* it can not find the IV see crt_iv_hdlr_xx().
25162526
* let's just stop the rebuild.
25172527
*/
2518-
if (rc == -DER_NONEXIST && !status.rebuilding)
2528+
if (rc == -DER_NONEXIST && !status.rebuilding) {
2529+
D_INFO(DF_RB ", rc %d, status.rebuilding %d, "
2530+
"set rt_global_done",
2531+
DP_RB_RPT(rpt), rc, status.rebuilding);
25192532
rpt->rt_global_done = 1;
2533+
}
25202534

25212535
if (ns->iv_stop) {
25222536
D_DEBUG(DB_REBUILD, "abort rebuild "

0 commit comments

Comments
 (0)