Skip to content

Commit 7552446

Browse files
authored
DAOS-17357 rebuild: use dmg for stop/start, force-stop fix (daos-stack#16890)
Remove use of fault injection for interactive rebuild testing in daos_test -v (rebuild simple tests). Replace it with new dmg command helper functions to invoke the control plane commands dmg pool rebuild stop and dmg pool rebuild start. Some common test functions are added and modified to support this. Also, some rebuild engine logic is updated to remove the associated fault injection handling. Restore the engine rebuild_ults() logic so that it does not loop forever (a change made in the initial implementation for fault injection). Also, update the engine rebuild stop logic so it does not allow "dmg pool rebuild stop --force" to stop a rebuild in op:Fail_reclaim, unless it has been failing repeatedly. For this, new counts are added in struct rebuild_task and struct rebuild_global_pool_tracker to track, for a high-level rebuild operation, how many times op:Rebuild, Reclaim, Fail_reclaim, etc. have been run (and how many times op:Rebuild and op:Fail_reclaim have failed). daos_test -v rebuild_many_objects_with_failure() is updated to specifically use the --force option to exercise arrival of the command during Fail_reclaim, and to exercise that the rebuild will not be stopped during Fail_reclaim in this instance. That is, it will work the same as without the --force option, preventing retry of the original op:Rebuild after the one/only Fail_reclaim is done. Signed-off-by: Kenneth Cain <kenneth.cain@hpe.com>
1 parent 0da5992 commit 7552446

11 files changed

Lines changed: 547 additions & 241 deletions

File tree

src/common/tests_dmg_helpers.c

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1206,6 +1206,80 @@ dmg_pool_list(const char *dmg_config_file, const char *group,
12061206
return rc;
12071207
}
12081208

1209+
int
1210+
dmg_pool_rebuild_stop(const char *dmg_config_file, const uuid_t uuid, const char *grp, bool force)
1211+
{
1212+
char uuid_str[DAOS_UUID_STR_SIZE];
1213+
int argcount = 0;
1214+
char **args = NULL;
1215+
struct json_object *dmg_out = NULL;
1216+
int rc = 0;
1217+
1218+
uuid_unparse_lower(uuid, uuid_str);
1219+
args = cmd_push_arg(args, &argcount, "%s ", uuid_str);
1220+
if (args == NULL)
1221+
D_GOTO(out, rc = -DER_NOMEM);
1222+
1223+
if (grp != NULL) {
1224+
args = cmd_push_arg(args, &argcount, "--sys=%s ", grp);
1225+
if (args == NULL)
1226+
D_GOTO(out, rc = -DER_NOMEM);
1227+
}
1228+
1229+
if (force) {
1230+
args = cmd_push_arg(args, &argcount, "--force");
1231+
if (args == NULL)
1232+
D_GOTO(out, rc = -DER_NOMEM);
1233+
}
1234+
1235+
rc = daos_dmg_json_pipe("pool rebuild stop", dmg_config_file, args, argcount, &dmg_out);
1236+
if (rc != 0) {
1237+
D_ERROR("dmg pool rebuild stop failed\n");
1238+
goto out_json;
1239+
}
1240+
1241+
out_json:
1242+
if (dmg_out != NULL)
1243+
json_object_put(dmg_out);
1244+
cmd_free_args(args, argcount);
1245+
out:
1246+
return rc;
1247+
}
1248+
1249+
int
1250+
dmg_pool_rebuild_start(const char *dmg_config_file, const uuid_t uuid, const char *grp)
1251+
{
1252+
char uuid_str[DAOS_UUID_STR_SIZE];
1253+
int argcount = 0;
1254+
char **args = NULL;
1255+
struct json_object *dmg_out = NULL;
1256+
int rc = 0;
1257+
1258+
uuid_unparse_lower(uuid, uuid_str);
1259+
args = cmd_push_arg(args, &argcount, "%s ", uuid_str);
1260+
if (args == NULL)
1261+
D_GOTO(out, rc = -DER_NOMEM);
1262+
1263+
if (grp != NULL) {
1264+
args = cmd_push_arg(args, &argcount, "--sys=%s ", grp);
1265+
if (args == NULL)
1266+
D_GOTO(out, rc = -DER_NOMEM);
1267+
}
1268+
1269+
rc = daos_dmg_json_pipe("pool rebuild start", dmg_config_file, args, argcount, &dmg_out);
1270+
if (rc != 0) {
1271+
D_ERROR("dmg pool rebuild start failed\n");
1272+
goto out_json;
1273+
}
1274+
1275+
out_json:
1276+
if (dmg_out != NULL)
1277+
json_object_put(dmg_out);
1278+
cmd_free_args(args, argcount);
1279+
out:
1280+
return rc;
1281+
}
1282+
12091283
static int
12101284
parse_device_info(struct json_object *smd_dev, device_list *devices,
12111285
char *host, int dev_length, int *disks)

src/include/daos/common.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -789,9 +789,6 @@ enum {
789789
#define DAOS_RDB_SKIP_APPENDENTRIES_FAIL (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1a)
790790
#define DAOS_FORCE_REFRESH_POOL_MAP (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1b)
791791

792-
#define DAOS_REBUILD_ADMIN_STOP (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1c)
793-
#define DAOS_REBUILD_ADMIN_START (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1d)
794-
795792
#define DAOS_FORCE_CAPA_FETCH (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1e)
796793
#define DAOS_FORCE_PROP_VERIFY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1f)
797794

src/include/daos/tests_lib.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,25 @@ dmg_pool_set_prop(const char *dmg_config_file,
484484
int dmg_pool_get_prop(const char *dmg_config_file, const char *label, const uuid_t uuid,
485485
const char *name, char **value);
486486

487+
/**
488+
* Interactively stop a pool's currently-running rebuild.
* Invokes the control plane command "dmg pool rebuild stop" for the pool.
489+
* \param dmg_config_file [IN] DMG config file.
490+
* \param uuid [IN] UUID of the pool.
491+
* \param grp [IN] Process set name of the DAOS servers managing the pool.
* When NULL, no --sys option is passed to dmg.
492+
* \param force [IN] forcibly stop a rebuild that is failing.
* When true, --force is added to the dmg command.
* \return 0 on success; -DER_NOMEM if the command arguments cannot be built;
* otherwise the nonzero result of running the dmg command.
493+
*/
494+
int
495+
dmg_pool_rebuild_stop(const char *dmg_config_file, const uuid_t uuid, const char *grp, bool force);
496+
497+
/**
498+
* Interactively start/resume a pool's rebuilding.
* Invokes the control plane command "dmg pool rebuild start" for the pool.
499+
* \param dmg_config_file [IN] DMG config file.
500+
* \param uuid [IN] UUID of the pool.
501+
* \param grp [IN] Process set name of the DAOS servers managing the pool.
* When NULL, no --sys option is passed to dmg.
* \return 0 on success; -DER_NOMEM if the command arguments cannot be built;
* otherwise the nonzero result of running the dmg command.
502+
*/
503+
int
504+
dmg_pool_rebuild_start(const char *dmg_config_file, const uuid_t uuid, const char *grp);
505+
487506
/**
488507
* List all disks in the specified DAOS system.
489508
*

src/include/daos_srv/rebuild.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ int
8787
ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t stable_eph,
8888
uint32_t layout_version, struct pool_target_id_list *tgts,
8989
daos_rebuild_opc_t rebuild_op, daos_rebuild_opc_t retry_rebuild_op,
90-
uint32_t retry_map_ver, bool stop_admin, uint64_t delay_sec);
90+
uint32_t retry_map_ver, bool stop_admin, void *cur_taskp, uint64_t delay_sec);
9191
void ds_rebuild_restart_if_rank_wip(uuid_t pool_uuid, d_rank_t rank);
9292
int ds_rebuild_query(uuid_t pool_uuid,
9393
struct daos_rebuild_status *status);

src/pool/srv_pool.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6410,7 +6410,7 @@ pool_check_upgrade_object_layout(struct rdb_tx *tx, struct pool_svc *svc,
64106410
rc = ds_rebuild_schedule(svc->ps_pool, svc->ps_pool->sp_map_version, upgrade_eph,
64116411
DAOS_POOL_OBJ_VERSION, NULL, RB_OP_UPGRADE,
64126412
RB_OP_NONE /* retry_rebuild_op */, 0 /* retry_map_ver */,
6413-
false /* stop_admin */, 0);
6413+
false /* stop_admin */, NULL /* cur_taskp */, 0);
64146414
if (rc == 0)
64156415
*scheduled_layout_upgrade = true;
64166416
}
@@ -7758,7 +7758,8 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank,
77587758
if (tgt_map_ver != 0) {
77597759
rc = ds_rebuild_schedule(svc->ps_pool, tgt_map_ver, rebuild_eph, 0, &target_list,
77607760
RB_OP_REBUILD, RB_OP_NONE /* retry_rebuild_op */,
7761-
0 /* retry_map_ver */, false /* stop_admin */, delay);
7761+
0 /* retry_map_ver */, false /* stop_admin */,
7762+
NULL /* cur_taskp */, delay);
77627763
if (rc != 0) {
77637764
D_ERROR("rebuild fails rc: "DF_RC"\n", DP_RC(rc));
77647765
D_GOTO(out, rc);

src/rebuild/rebuild_internal.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,11 @@ struct rebuild_global_pool_tracker {
168168
uint32_t rgt_opc;
169169
unsigned int rgt_abort : 1, /* abort: kill rebuild */
170170
rgt_init_scan : 1, rgt_stop_admin : 1; /* stop: admin has asked to kill rebuild */
171+
172+
uint32_t rgt_num_op_rb; /* count of op:Rebuild attempts */
173+
uint32_t rgt_num_op_freclaim; /* count of op:Fail_reclaim attempts */
174+
uint32_t rgt_num_op_rb_fail; /* count of op:Rebuild failures */
175+
uint32_t rgt_num_op_freclaim_fail; /* count of failed op:Fail_reclaim (not good) */
171176
};
172177

173178
/* Structure on raft replica nodes to serve completed rebuild status querying */
@@ -254,6 +259,14 @@ struct rebuild_task {
254259
* Then, on fail_reclaim finish, the pool rebuild state will be set to idle (NOT_STARTED).
255260
*/
256261
bool dst_stop_admin;
262+
263+
/* Track how many tries for certain daos_rebuild_opc_t */
264+
uint32_t dst_num_op_rb; /* count of tries to run rebuild */
265+
uint32_t dst_num_op_reclaim;
266+
uint32_t dst_num_op_freclaim;
267+
uint32_t dst_num_op_upgrade;
268+
uint32_t dst_num_op_rb_fail; /* count of rebuild failures */
269+
uint32_t dst_num_op_freclaim_fail; /* count of Fail_reclaim failures */
257270
};
258271

259272
/* Per pool structure in TLS to check pool rebuild status

0 commit comments

Comments
 (0)