-
Notifications
You must be signed in to change notification settings - Fork 2k
Introduce failure domains to dRAID #18148
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -100,12 +100,16 @@ find_pool(zpool_handle_t *zhp, void *data) | |
| * Find a vdev within a tree with a matching GUID. | ||
| */ | ||
| static nvlist_t * | ||
| find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) | ||
| find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid, | ||
| uint64_t *parent_guid) | ||
| { | ||
| uint64_t guid; | ||
| uint64_t guid, saved_parent_guid; | ||
| nvlist_t **child; | ||
| uint_t c, children; | ||
| nvlist_t *ret; | ||
| nvlist_t *ret = NULL; | ||
|
|
||
| if (parent_guid != NULL) | ||
| saved_parent_guid = *parent_guid; | ||
|
|
||
| if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && | ||
| guid == search_guid) { | ||
|
|
@@ -119,29 +123,38 @@ find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) | |
| return (NULL); | ||
|
|
||
| for (c = 0; c < children; c++) { | ||
| if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) | ||
| return (ret); | ||
| if ((ret = find_vdev(zhdl, child[c], search_guid, | ||
| parent_guid)) != NULL) | ||
| goto out; | ||
| } | ||
|
|
||
| if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, | ||
| &child, &children) != 0) | ||
| return (NULL); | ||
|
|
||
| for (c = 0; c < children; c++) { | ||
| if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) | ||
| return (ret); | ||
| if ((ret = find_vdev(zhdl, child[c], search_guid, | ||
| parent_guid)) != NULL) | ||
| goto out; | ||
| } | ||
|
|
||
| if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, | ||
| &child, &children) != 0) | ||
| return (NULL); | ||
|
|
||
| for (c = 0; c < children; c++) { | ||
| if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) | ||
| return (ret); | ||
| if ((ret = find_vdev(zhdl, child[c], search_guid, | ||
| parent_guid)) != NULL) | ||
| goto out; | ||
| } | ||
|
|
||
| return (NULL); | ||
| out: | ||
| /* If parent_guid was set, don't reset it. */ | ||
| if (ret != NULL && parent_guid != NULL && | ||
| saved_parent_guid == *parent_guid) | ||
| *parent_guid = guid; | ||
| return (ret); | ||
| } | ||
|
|
||
| static int | ||
|
|
@@ -203,11 +216,12 @@ find_and_remove_spares(libzfs_handle_t *zhdl, uint64_t vdev_guid) | |
| } | ||
|
|
||
| /* | ||
| * Given a (pool, vdev) GUID pair, find the matching pool and vdev. | ||
| * Given a (pool, vdev) GUID pair, find the matching pool, vdev and | ||
| * its top_guid. | ||
| */ | ||
| static zpool_handle_t * | ||
| find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, | ||
| nvlist_t **vdevp) | ||
| find_by_guid_impl(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, | ||
| nvlist_t **vdevp, uint64_t *top_guid) | ||
| { | ||
| find_cbdata_t cb; | ||
| zpool_handle_t *zhp; | ||
|
|
@@ -229,7 +243,8 @@ find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, | |
| } | ||
|
|
||
| if (vdev_guid != 0) { | ||
| if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { | ||
| if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid, | ||
| top_guid)) == NULL) { | ||
| zpool_close(zhp); | ||
| return (NULL); | ||
| } | ||
|
|
@@ -238,6 +253,96 @@ find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, | |
| return (zhp); | ||
| } | ||
|
|
||
| /* | ||
| * Given a (pool, vdev) GUID pair, find the matching pool and vdev. | ||
| */ | ||
| static zpool_handle_t * | ||
| find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, | ||
| nvlist_t **vdevp) | ||
| { | ||
| return (find_by_guid_impl(zhdl, pool_guid, vdev_guid, vdevp, NULL)); | ||
| } | ||
|
|
||
| /* | ||
| * Given a (pool, vdev) GUID pair, count the number of faulted vdevs in | ||
| * its top vdev and return TRUE if the number of failures at i-th device | ||
| * index in each dRAID failure group equals the number of failure groups, | ||
| * which means it's the domain failure, and the vdev is one of those faults. | ||
| * Otherwise, return FALSE. | ||
| */ | ||
| static boolean_t | ||
| is_draid_fdomain_failure(libzfs_handle_t *zhdl, uint64_t pool_guid, | ||
| uint64_t vdev_guid) | ||
| { | ||
| uint64_t guid, top_guid; | ||
| uint64_t children; | ||
| nvlist_t *nvtop, *vdev, **child; | ||
| vdev_stat_t *vs; | ||
| uint_t i, c, vdev_i = UINT_MAX, width, *nfaults_map; | ||
|
|
||
| if (find_by_guid_impl(zhdl, pool_guid, vdev_guid, &vdev, | ||
| &top_guid) == NULL) | ||
| return (B_FALSE); | ||
|
|
||
| if (find_by_guid_impl(zhdl, pool_guid, top_guid, &nvtop, | ||
| NULL) == NULL) | ||
| return (B_FALSE); | ||
|
|
||
| if (nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, | ||
| &child, &width) != 0) | ||
| return (B_FALSE); | ||
|
|
||
| if (nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_DRAID_NCHILDREN, | ||
| &children) != 0) /* not dRAID */ | ||
| return (B_FALSE); | ||
|
|
||
| if (width == children) /* dRAID without failure domains */ | ||
| return (B_FALSE); | ||
|
|
||
| /* | ||
| * No rush with starting resilver, it can be domain failure, | ||
| * in which case we need to wait a little to allow more devices | ||
| * to get into faulted state so that we could detect that | ||
| * it's the domain failure indeed. | ||
| */ | ||
| sleep(5); | ||
|
|
||
| nfaults_map = calloc(children, sizeof (*nfaults_map)); | ||
| if (nfaults_map == NULL) { | ||
| perror("malloc"); | ||
| exit(EXIT_FAILURE); | ||
| } | ||
|
|
||
| for (c = 0; c < width; c++) { | ||
| nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_VDEV_STATS, | ||
| (uint64_t **)&vs, &i); | ||
|
|
||
| if (vs->vs_state == VDEV_STATE_FAULTED) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Only checking for FAULTED is probably too strict since if the device is unavailable for another reason this logic won't detect it. For example, it was already replaced by a distributed hot spare. Checking for |
||
| nfaults_map[c % children]++; | ||
|
|
||
| if (vs->vs_state == VDEV_STATE_FAULTED && | ||
| nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, | ||
| &guid) == 0 && guid == vdev_guid) | ||
| vdev_i = (c % children); | ||
| } | ||
|
|
||
| boolean_t res = B_FALSE; | ||
| for (c = 0; c < children; c++) { | ||
| if (c == vdev_i && nfaults_map[c] == (width / children)) { | ||
| res = B_TRUE; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| free(nfaults_map); | ||
|
|
||
| if (res) | ||
| fmd_hdl_debug(fmd_module_hdl("zfs-retire"), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| "vdev %llu belongs to draid fdomain failure", vdev_guid); | ||
|
|
||
| return (res); | ||
| } | ||
|
|
||
| /* | ||
| * Given a vdev, attempt to replace it with every known spare until one | ||
| * succeeds or we run out of devices to try. | ||
|
|
@@ -445,6 +550,14 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, | |
| if (vs->vs_state == VDEV_STATE_OFFLINE) | ||
| return; | ||
|
|
||
| /* | ||
| * Resilvering domain failures can take a lot of computing and | ||
| * I/O bandwidth resources, only to be wasted when the failed | ||
| * domain component (for example enclosure) is replaced. | ||
| */ | ||
| if (is_draid_fdomain_failure(zhdl, pool_guid, vdev_guid)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I need to investigate further but when locally testing with a |
||
| return; | ||
|
|
||
| /* | ||
| * If state removed is requested for already removed vdev, | ||
| * its a loopback event from spa_async_remove(). Just | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: let's use the
fmd_hdl_alloc/fmd_hdl_free wrappers with FMD_SLEEP, which can't fail.