diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index 206caa16baa6..8058f17ce560 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -726,7 +726,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) */ if (isresource) { zfs_stats.resource_drops.fmds_value.ui64++; - fmd_hdl_debug(hdl, "discarding '%s for vdev %llu", + fmd_hdl_debug(hdl, "discarding '%s' for vdev %llu", class, vdev_guid); return; } diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index d68272bea731..40ad346a8624 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -100,12 +100,16 @@ find_pool(zpool_handle_t *zhp, void *data) * Find a vdev within a tree with a matching GUID. */ static nvlist_t * -find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) +find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid, + uint64_t *parent_guid) { - uint64_t guid; + uint64_t guid, saved_parent_guid; nvlist_t **child; uint_t c, children; - nvlist_t *ret; + nvlist_t *ret = NULL; + + if (parent_guid != NULL) + saved_parent_guid = *parent_guid; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && guid == search_guid) { @@ -119,8 +123,9 @@ find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) return (NULL); for (c = 0; c < children; c++) { - if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) - return (ret); + if ((ret = find_vdev(zhdl, child[c], search_guid, + parent_guid)) != NULL) + goto out; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, @@ -128,8 +133,9 @@ find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) return (NULL); for (c = 0; c < children; c++) { - if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) - return (ret); + if ((ret = find_vdev(zhdl, child[c], search_guid, + parent_guid)) != NULL) + goto out; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, @@ -137,11 +143,18 
@@ find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) return (NULL); for (c = 0; c < children; c++) { - if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) - return (ret); + if ((ret = find_vdev(zhdl, child[c], search_guid, + parent_guid)) != NULL) + goto out; } return (NULL); +out: + /* If parent_guid was set, don't reset it. */ + if (ret != NULL && parent_guid != NULL && + saved_parent_guid == *parent_guid) + *parent_guid = guid; + return (ret); } static int @@ -203,11 +216,12 @@ find_and_remove_spares(libzfs_handle_t *zhdl, uint64_t vdev_guid) } /* - * Given a (pool, vdev) GUID pair, find the matching pool and vdev. + * Given a (pool, vdev) GUID pair, find the matching pool, vdev and + * its top_guid. */ static zpool_handle_t * -find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, - nvlist_t **vdevp) +find_by_guid_impl(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp, uint64_t *top_guid) { find_cbdata_t cb; zpool_handle_t *zhp; @@ -229,7 +243,8 @@ find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, } if (vdev_guid != 0) { - if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { + if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid, + top_guid)) == NULL) { zpool_close(zhp); return (NULL); } @@ -238,6 +253,96 @@ find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, return (zhp); } +/* + * Given a (pool, vdev) GUID pair, find the matching pool and vdev. 
+ */ +static zpool_handle_t * +find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp) +{ + return (find_by_guid_impl(zhdl, pool_guid, vdev_guid, vdevp, NULL)); +} + +/* + * Given a (pool, vdev) GUID pair, count the number of faulted vdevs in + * its top vdev and return TRUE if the number of failures at i-th device + * index in each dRAID failure group, equals to the number of failure groups, + * which means it's the domain failure, and the vdev is one of those faults. + * Otherwise, return FALSE. + */ +static boolean_t +is_draid_fdomain_failure(libzfs_handle_t *zhdl, uint64_t pool_guid, + uint64_t vdev_guid) +{ + uint64_t guid, top_guid; + uint64_t children; + nvlist_t *nvtop, *vdev, **child; + vdev_stat_t *vs; + uint_t i, c, vdev_i = UINT_MAX, width, *nfaults_map; + + if (find_by_guid_impl(zhdl, pool_guid, vdev_guid, &vdev, + &top_guid) == NULL) + return (B_FALSE); + + if (find_by_guid_impl(zhdl, pool_guid, top_guid, &nvtop, + NULL) == NULL) + return (B_FALSE); + + if (nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, + &child, &width) != 0) + return (B_FALSE); + + if (nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_DRAID_NCHILDREN, + &children) != 0) /* not dRAID */ + return (B_FALSE); + + if (width == children) /* dRAID without failure domains */ + return (B_FALSE); + + /* + * No rush with starting resilver, it can be domain failure, + * in which case we need to wait a little to allow more devices + * to get into faulted state so that we could detect that + * it's the domain failure indeed. 
+ */ + sleep(5); + + nfaults_map = calloc(children, sizeof (*nfaults_map)); + if (nfaults_map == NULL) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + for (c = 0; c < width; c++) { + nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &i); + + if (vs->vs_state == VDEV_STATE_FAULTED) + nfaults_map[c % children]++; + + if (vs->vs_state == VDEV_STATE_FAULTED && + nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, + &guid) == 0 && guid == vdev_guid) + vdev_i = (c % children); + } + + boolean_t res = B_FALSE; + for (c = 0; c < children; c++) { + if (c == vdev_i && nfaults_map[c] == (width / children)) { + res = B_TRUE; + break; + } + } + + free(nfaults_map); + + if (res) + fmd_hdl_debug(fmd_module_hdl("zfs-retire"), + "vdev %llu belongs to draid fdomain failure", vdev_guid); + + return (res); +} + /* * Given a vdev, attempt to replace it with every known spare until one * succeeds or we run out of devices to try. @@ -445,6 +550,14 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, if (vs->vs_state == VDEV_STATE_OFFLINE) return; + /* + * Resilvering domain failures can take a lot of computing and + * I/O bandwidth resources, only to be wasted when the failed + * domain component (for example enclosure) is replaced. + */ + if (is_draid_fdomain_failure(zhdl, pool_guid, vdev_guid)) + return; + /* * If state removed is requested for already removed vdev, * its a loopback event from spa_async_remove(). 
Just diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 265d7488dd8a..0bf33de8de66 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3528,6 +3528,11 @@ show_import(nvlist_t *config, boolean_t report_error) "accessed by another system.\n")); break; + case ZPOOL_STATUS_FAULTED_FDOM_R: + (void) printf_color(ANSI_YELLOW, gettext("One or more failure " + " domains are faulted.\n")); + break; + case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: (void) printf_color(ANSI_YELLOW, gettext("One or more devices " @@ -8039,7 +8044,7 @@ zpool_do_online(int argc, char **argv) if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("failed to open pool " - "\"%s\""), poolname); + "\"%s\"\n"), poolname); return (1); } @@ -8183,7 +8188,7 @@ zpool_do_offline(int argc, char **argv) if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("failed to open pool " - "\"%s\""), poolname); + "\"%s\"\n"), poolname); return (1); } @@ -10725,6 +10730,18 @@ print_status_reason(zpool_handle_t *zhp, status_cbdata_t *cbp, "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; + case ZPOOL_STATUS_FAULTED_FDOM_R: + (void) snprintf(status, ST_SIZE, + gettext("One or more failure domains are faulted. " + "The storage devices may be\n\tintact. 
Sufficient " + "replicas exist for the pool to continue functioning\n\t" + "in a degraded state.\n")); + (void) snprintf(action, AC_SIZE, + gettext("Replace the faulted domain device, " + "or use 'zpool clear' to mark domain\n\tstorage devices " + "repaired.\n")); + break; + case ZPOOL_STATUS_FAULTED_DEV_NR: (void) snprintf(status, ST_SIZE, gettext("One or more devices are " diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index d1e9ef76dc10..fe9f574ab7bf 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -1323,36 +1323,44 @@ is_grouping(const char *type, int *mindev, int *maxdev) * Extract the configuration parameters encoded in the dRAID type and * use them to generate a dRAID configuration. The expected format is: * - * draid[][:][:][:] + * draid[][:d][:c][:s][:w] * * The intent is to be able to generate a good configuration when no * additional information is provided. The only mandatory component * of the 'type' is the 'draid' prefix. If a value is not provided * then reasonable defaults are used. The optional components may - * appear in any order but the d/s/c suffix is required. + * appear in any order but the d/s/c/w suffix is required. 
* * Valid inputs: * - data: number of data devices per group (1-255) - * - parity: number of parity blocks per group (1-3) - * - spares: number of distributed spare (0-100) - * - children: total number of devices (1-255) + * - parity: number of parity devices per group (1-3) + * - children: total number of devices in slice (1-255) + * - width: total number of devices, multiple of children (1-255 for now) + * - spares: number of distributed spare devices (0-100), must be + * multiple of failure groups (width / children) * * Examples: * - zpool create tank draid * - zpool create tank draid2:8d:51c:2s + * - zpool create tank draid2:8d:12c:96w:8s */ static int -draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) +draid_config_by_type(nvlist_t *nv, const char *type, uint64_t width, + int nfgroup, int nfdomain) { uint64_t nparity; uint64_t nspares = 0; uint64_t ndata = UINT64_MAX; uint64_t ngroups = 1; + uint64_t children = 0; long value; if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) return (EINVAL); + if (nfgroup && nfdomain) /* must be only one of two or none */ + return (EINVAL); + nparity = (uint64_t)get_parity(type); if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { fprintf(stderr, @@ -1376,24 +1384,35 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) return (EINVAL); } - /* Expected non-zero value with c/d/s suffix */ + /* Expected non-zero value with c/d/s/w suffix */ value = strtol(p, &end, 10); char suffix = tolower(*end); if (errno != 0 || - (suffix != 'c' && suffix != 'd' && suffix != 's')) { + (suffix != 'c' && suffix != 'd' && suffix != 's' && + suffix != 'w')) { (void) fprintf(stderr, gettext("invalid dRAID " - "syntax; expected [:] not '%s'\n"), - type); + "syntax; expected [:], " + "not '%s'\n"), type); return (EINVAL); } if (suffix == 'c') { - if ((uint64_t)value != children) { + if ((uint64_t)value > width || + width % (uint64_t)value != 0) { fprintf(stderr, - gettext("invalid number 
of dRAID children; " + gettext("invalid number of dRAID disks; " + "multiple of %llu required but %llu " + "provided\n"), (u_longlong_t)value, + (u_longlong_t)width); + return (EINVAL); + } + children = value; + } else if (suffix == 'w') { + if ((uint64_t)value != width) { + fprintf(stderr, + gettext("invalid number of dRAID disks; " "%llu required but %llu provided\n"), - (u_longlong_t)value, - (u_longlong_t)children); + (u_longlong_t)value, (u_longlong_t)width); return (EINVAL); } } else if (suffix == 'd') { @@ -1405,6 +1424,42 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) } } + if (!children && nfgroup) + children = width / nfgroup; + if (!children && nfdomain) + children = nfdomain; + if (!children) + children = width; + + int fgrps = width / children; + + if ((nspares % fgrps) != 0) { + fprintf(stderr, gettext("invalid number of distributed spares " + "%llu, must be multiple of failure groups %d\n"), + (u_longlong_t)nspares, fgrps); + return (EINVAL); + } + + if (fgrps == 1 && (nfgroup || nfdomain)) { + fprintf(stderr, gettext("failure domains are not set " + "in dRAID vdev descriptor\n")); + return (EINVAL); + } + + if (fgrps > 1 && nfgroup && fgrps != nfgroup) { + fprintf(stderr, gettext("invalid number of failure groups " + "%d, must be %d\n"), nfgroup, fgrps); + return (EINVAL); + } + + if (fgrps > 1 && nfdomain && nfdomain != children) { + fprintf(stderr, gettext("invalid number of failure domains " + "%d, must be %lu\n"), nfdomain, children); + return (EINVAL); + } + + nspares /= fgrps; + /* * When a specific number of data disks is not provided limit a * redundancy group to 8 data disks. 
This value was selected to @@ -1414,8 +1469,8 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) if (children > nspares + nparity) { ndata = MIN(children - nspares - nparity, 8); } else { - fprintf(stderr, gettext("request number of " - "distributed spares %llu and parity level %llu\n" + fprintf(stderr, gettext("requested number of " + "distributed spares %llu and parity level %llu " "leaves no disks available for data\n"), (u_longlong_t)nspares, (u_longlong_t)nparity); return (EINVAL); @@ -1450,7 +1505,7 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) (u_longlong_t)(ndata + nparity + nspares)); } - if (children > VDEV_DRAID_MAX_CHILDREN) { + if (width > VDEV_DRAID_MAX_CHILDREN) { fprintf(stderr, gettext("%llu disks were provided, but " "dRAID only supports up to %u disks"), (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); @@ -1467,8 +1522,9 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) /* Store the basic dRAID configuration. 
*/ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares * fgrps); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NCHILDREN, children); return (0); } @@ -1606,10 +1662,41 @@ construct_spec(nvlist_t *props, int argc, char **argv) nlogs++; } + int nfdomain = 0, nfgroup = 0; + int fdndev = 0, fgndev = 0; + int fdndev_prev = 0, fgndev_prev = 0; + for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + if (strcmp(argv[c], "fgroup") == 0 || + strcmp(argv[c], "failure_group") == 0) { + if (fgndev_prev && + fgndev_prev != fgndev) + break; + fgndev_prev = fgndev; + fgndev = 0; + nfgroup++; + continue; + } + + if (strcmp(argv[c], "fdomain") == 0 || + strcmp(argv[c], "failure_domain") == 0) { + if (fdndev_prev && + fdndev_prev != fdndev) + break; + fdndev_prev = fdndev; + fdndev = 0; + nfdomain++; + continue; + } + + if (nfgroup) + fgndev++; + if (nfdomain) + fdndev++; + children++; child = realloc(child, children * sizeof (nvlist_t *)); @@ -1647,6 +1734,81 @@ construct_spec(nvlist_t *props, int argc, char **argv) goto spec_out; } + if ((nfdomain || nfgroup) && + strcmp(type, VDEV_TYPE_DRAID) != 0) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s is not dRAID and cannot " + "have failure domains\n"), argv[0]); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (nfgroup && nfdomain) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s has mixed configuration " + "of %d failure groups and %d failure " + "domains, it must have either fgroups or " + "fdomains, not both\n"), argv[0], + nfgroup, nfdomain); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (nfgroup == 1 || 
nfdomain == 1) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s has only one failure %s " + "configured, it must be more than one\n"), + argv[0], nfgroup ? "group" : "domain"); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (fgndev_prev != fgndev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s has different number of " + "devices in failure group %d than in " + "previous group: %d != %d\n"), argv[0], + nfgroup, fgndev, fgndev_prev); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (fdndev_prev != fdndev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s has different number of " + "devices in failure domain %d than in " + "previous domain: %d != %d\n"), argv[0], + nfdomain, fdndev, fdndev_prev); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (nfdomain) { + /* Put children in the right order */ + nvlist_t **ch = NULL; + ch = realloc(ch, + children * sizeof (nvlist_t *)); + if (ch == NULL) + zpool_no_memory(); + int dlen = children / nfdomain; + int i = 0; + for (int g = 0; g < dlen; g++) + for (int d = 0; d < nfdomain; d++) + ch[i++] = child[g + (d * dlen)]; + free(child); + child = ch; + } + argc -= c; argv += c; @@ -1692,7 +1854,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) } if (strcmp(type, VDEV_TYPE_DRAID) == 0) { if (draid_config_by_type(nv, - fulltype, children) != 0) { + fulltype, children, nfgroup, + nfdomain) != 0) { for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); diff --git a/include/libzfs.h b/include/libzfs.h index 0ff3948e117b..f3bef7af62d5 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -443,6 +443,7 @@ typedef enum { * checksum errors) has been lost. 
*/ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ + ZPOOL_STATUS_FAULTED_FDOM_R, /* faulted fdomain with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ /* diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index de2149641d21..9f3ff814c5ee 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -392,6 +392,8 @@ typedef enum { VDEV_PROP_AUTOSIT, VDEV_PROP_SLOW_IO_EVENTS, VDEV_PROP_SCHEDULER, + VDEV_PROP_FDOMAIN, + VDEV_PROP_FGROUP, VDEV_NUM_PROPS } vdev_prop_t; @@ -926,6 +928,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" #define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" #define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" +#define ZPOOL_CONFIG_DRAID_NCHILDREN "draid_nchildren" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h index e923092a39ad..e51a1a59f00b 100644 --- a/include/sys/vdev_draid.h +++ b/include/sys/vdev_draid.h @@ -68,9 +68,10 @@ typedef struct vdev_draid_config { */ uint64_t vdc_ndata; /* # of data devices in group */ uint64_t vdc_nparity; /* # of parity devices in group */ - uint64_t vdc_nspares; /* # of distributed spares */ + uint64_t vdc_nspares; /* # of distributed spares in slice */ uint64_t vdc_children; /* # of children */ uint64_t vdc_ngroups; /* # groups per slice */ + uint64_t vdc_width; /* # multiple of children */ /* * Immutable derived constants. @@ -103,7 +104,9 @@ extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); /* Functions for dRAID distributed spares. 
*/ extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t); extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); -extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t); +extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t *, + uint64_t); +extern boolean_t vdev_draid_fail_domain_allowed(vdev_t *); #ifdef __cplusplus } diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 56382ca85b55..64606de226b0 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -90,6 +90,7 @@ typedef enum spa_feature { SPA_FEATURE_DYNAMIC_GANG_HEADER, SPA_FEATURE_BLOCK_CLONING_ENDIAN, SPA_FEATURE_PHYSICAL_REWRITE, + SPA_FEATURE_DRAID_FAIL_DOMAINS, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 876433c0ba58..9ce5d719cdfa 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -690,7 +690,7 @@ - + @@ -6175,18 +6175,19 @@ - - - - - - - - - - - - + + + + + + + + + + + + + @@ -6258,7 +6259,9 @@ - + + + @@ -6542,7 +6545,8 @@ - + + @@ -9909,8 +9913,8 @@ - - + + @@ -9971,7 +9975,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index e12308b01ab1..66b6f4fe448a 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1453,11 +1453,17 @@ zpool_has_draid_vdev(nvlist_t *nvroot) */ static char * zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity, - uint64_t spares, uint64_t children) + uint64_t spares, uint64_t children, uint64_t width) { - snprintf(name, len, "%s%llu:%llud:%lluc:%llus", - VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, - (u_longlong_t)children, (u_longlong_t)spares); + if (children < width) + snprintf(name, len, "%s%llu:%llud:%lluc:%lluw:%llus", + VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, + (u_longlong_t)children, (u_longlong_t)width, + (u_longlong_t)spares); + else + snprintf(name, len, "%s%llu:%llud:%lluc:%llus", + VDEV_TYPE_DRAID, 
(u_longlong_t)parity, (u_longlong_t)data, + (u_longlong_t)children, (u_longlong_t)spares); return (name); } @@ -4584,12 +4590,12 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, * If it's a dRAID device, we add parity, groups, and spares. */ if (strcmp(path, VDEV_TYPE_DRAID) == 0) { - uint64_t ndata, nparity, nspares; + uint64_t ndata, nparity, nspares, children; nvlist_t **child; - uint_t children; + uint_t width; verify(nvlist_lookup_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); + ZPOOL_CONFIG_CHILDREN, &child, &width) == 0); nparity = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); ndata = fnvlist_lookup_uint64(nv, @@ -4597,8 +4603,12 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, nspares = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES); + if (nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_DRAID_NCHILDREN, &children) != 0) + children = width; + path = zpool_draid_name(buf, sizeof (buf), ndata, - nparity, nspares, children); + nparity, nspares, children, width); } /* @@ -5522,6 +5532,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: + case VDEV_PROP_FDOMAIN: + case VDEV_PROP_FGROUP: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index a589ca6896f0..d39172f45008 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -154,8 +154,12 @@ vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc, void *arg) /* * Detect if any leaf devices that have seen errors or could not be opened. 
+ * Returns: + * - EDOM if a failure domain in dRAID vdev is down + * - ENXIO if any device is problematic + * - 0 (zero) otherwise */ -static boolean_t +static int find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t, void *), void *arg, boolean_t ignore_replacing) { @@ -172,22 +176,41 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t, void *), const char *type = fnvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE); if (strcmp(type, VDEV_TYPE_REPLACING) == 0) - return (B_FALSE); + return (0); } if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { + + uint64_t fgrp_children = 0; + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_DRAID_NCHILDREN, + &fgrp_children); + + for (c = 0; c < fgrp_children; c++) { + int nfgrps = children / fgrp_children; + int nfaults = 0; + for (int g = 0; g < nfgrps; g++) { + if (find_vdev_problem(child[c + + (g * fgrp_children)], func, arg, + ignore_replacing)) + nfaults++; + } + if (nfaults == nfgrps) + return (EDOM); + } + for (c = 0; c < children; c++) { - if (find_vdev_problem(child[c], func, arg, - ignore_replacing)) - return (B_TRUE); + int res; + if ((res = find_vdev_problem(child[c], func, arg, + ignore_replacing))) + return (res); } } else { uint_t vsc; vdev_stat_t *vs = (vdev_stat_t *)fnvlist_lookup_uint64_array( vdev, ZPOOL_CONFIG_VDEV_STATS, &vsc); if (func(vs, vsc, arg) != 0) - return (B_TRUE); + return (ENXIO); } /* @@ -198,11 +221,11 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t, void *), for (c = 0; c < children; c++) { if (find_vdev_problem(child[c], func, arg, ignore_replacing)) - return (B_TRUE); + return (ENXIO); } } - return (B_FALSE); + return (0); } /* @@ -406,6 +429,10 @@ check_status(nvlist_t *config, boolean_t isimport, /* * Missing devices in a replicated config. 
*/ + if (find_vdev_problem(nvroot, vdev_faulted, NULL, B_TRUE) == EDOM) + return (ZPOOL_STATUS_FAULTED_FDOM_R); + if (find_vdev_problem(nvroot, vdev_missing, NULL, B_TRUE) == EDOM) + return (ZPOOL_STATUS_FAULTED_FDOM_R); if (find_vdev_problem(nvroot, vdev_faulted, NULL, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_R); if (find_vdev_problem(nvroot, vdev_missing, NULL, B_TRUE)) diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 3b65a52ae630..02ad1df48230 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -64,7 +64,7 @@ The values of non-numeric properties are case sensitive and must be lowercase. The following native properties consist of read-only statistics about the vdev. These properties can not be changed. -.Bl -tag -width "fragmentation" +.Bl -tag -width "failure_domain" .It Sy capacity Percentage of vdev space used .It Sy state @@ -89,6 +89,14 @@ How much this vdev can expand by Percent of fragmentation in this vdev .It Sy parity The level of parity for this vdev +.It Sy failure_domain +Failure domain id of this child vdev in +.Sy dRAID +vdev with failure domains feature +.It Sy failure_group +Failure group id of this child vdev in +.Sy dRAID +vdev with failure domains feature .It Sy devid The device id for this vdev .It Sy physpath @@ -114,7 +122,7 @@ threshold in milliseconds For .Sy RAIDZ and -.Sy DRAID +.Sy dRAID configurations, this value also represents the number of times the vdev was identified as an outlier and excluded from participating in read I/O operations. .It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops @@ -166,7 +174,7 @@ failfast. Only valid for .Sy RAIDZ and -.Sy DRAID +.Sy dRAID vdevs. True when a slow disk outlier was detected and the vdev is currently in a sit out state. @@ -180,7 +188,7 @@ data will be reconstructed as needed from parity. Only valid for .Sy RAIDZ and -.Sy DRAID +.Sy dRAID vdevs. If set, this enables the kernel-level slow disk detection logic. 
This logic automatically causes any vdevs that are significant negative diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b4404a6eb58d..aeedaaca5a25 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -504,6 +504,32 @@ vdev type, or when adding a new .Sy draid vdev to an existing pool. . +.feature com.seagate draid_failure_domains no draid +This feature enables use of failure domains in +.Sy draid +vdev type. +Failure domains allow for an entire set of devices that belong to a domain +to fail without taking the pool offline. +Devices that are likely to fail together due to sharing a common component, +such as an enclosure, HBA, or SAS expander, are good candidates to form a +failure domain. +For example, on a setup with several enclosures the user defines a failure +domain for each enclosure with all its devices and can arrange devices into +failure groups in such a way that every i-th device in every group belongs +to i-th enclosure. +This will allow tolerating the failure of the whole enclosure. +The size of the failure group is equal to the number of failure domains, +and it cannot be less than the size of the redundancy group +(parity + data + spares). +.Pp +This feature becomes +.Sy active +when creating a pool which uses the +.Sy draid +vdev type with failure domains configured, or when adding a new +.Sy draid +vdev with failure domains to an existing pool. +. .feature com.klarasystems dynamic_gang_header no This feature enables larger gang headers based on the sector size of the pool. When enabled, gang headers will use the entire space allocated for them, instead diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 07b78dda2396..ba7cd6399fa1 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -26,6 +26,7 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
+.\" Copyright (c) 2026 Seagate Technology, LLC. .\" .Dd August 6, 2025 .Dt ZPOOLCONCEPTS 7 @@ -142,7 +143,7 @@ A dRAID with .No parity level, and Em S No distributed hot spares can hold approximately .Em (N-S)*(D/(D+P))*X No bytes and can withstand Em P devices failing without losing data. -.It Sy draid Ns Oo Ar parity Oc Ns Oo Sy \&: Ns Ar data Ns Sy d Oc Ns Oo Sy \&: Ns Ar children Ns Sy c Oc Ns Oo Sy \&: Ns Ar spares Ns Sy s Oc +.It Sy draid Ns Oo Ar parity Oc Ns Oo Sy \&: Ns Ar data Ns Sy d Oc Ns Oo Sy \&: Ns Ar children Ns Sy c Oc Ns Oo Sy \&: Ns Ar width Ns Sy w Oc Ns Oo Sy \&: Ns Ar spares Ns Sy s Oc A non-default dRAID configuration can be specified by appending one or more of the following optional arguments to the .Sy draid @@ -161,9 +162,34 @@ Defaults to The expected number of children. Useful as a cross-check when listing a large number of devices. An error is returned when the provided number of children differs. +.It Ar width +You can configure several groups of children in the same row, in which case +.Em width No would be a multiple of Em children . +Such configurations allow the creation of failure groups with every i-th device +in each group being from different failure domain (for example an enclosure) +so that if all devices in one domain fail, the +.Em draid No vdev still will be operational with enough redundancy to +rebuild the data. +In case of +.Em draid2 , No two domains can fail at a time, in case of +.Em draid3 No \(em three domains (provided there are no other failures +in any failure group). +For each group, it will be only one, two or three failures. .It Ar spares The number of distributed hot spares. +If failure domains are configured +.Em ( width No > Em children ) , No it must be a +multiple of the number of failure groups so that each group has the same +number of spares. +All spares are shared between failure groups. Defaults to zero. 
+.Pp +Note: to support domain failure, we cannot have more than +.Em parity-1 No failures in any failure group, no matter if the failed +devices are rebuilt to draid hot spares or not \(em the blocks of those +spares can be mapped to the devices from the failed domain, and we cannot +tolerate more than +.Em parity No failures in any failure group . .El .It Sy spare A pseudo-vdev which keeps track of available hot spares for a pool. @@ -202,6 +228,10 @@ A cache device cannot be configured as a mirror or raidz group. For more information, see the .Sx Cache Devices section. +.It Sy fdomain No or Sy failure_domain +Denotes the list of failure domain devices for dRAID vdev. +.It Sy fgroup No or Sy failure_group +Denotes the list of failure group devices for dRAID vdev. .El .Pp Virtual devices cannot be nested arbitrarily. @@ -364,7 +394,13 @@ pools. The .Sy draid vdev type provides distributed hot spares. -These hot spares are named after the dRAID vdev they're a part of +These are virtual devices whose blocks are reserved and distributed among +all real devices, which makes resilvering to them much faster because one +device is not a bottleneck anymore. +Fast resilvering is crucial for data durability, it decreases the time of +having degraded data redundancy in the pool, thus decreasing the chance of +losing more devices at a time than we can tolerate. +dRAID hot spares are named after the draid vdev they're a part of .Po Sy draid1 Ns - Ns Ar 2 Ns - Ns Ar 3 No specifies spare Ar 3 No of vdev Ar 2 , .No which is a single parity dRAID Pc and may only be used by that dRAID vdev. diff --git a/man/man8/zpool-create.8 b/man/man8/zpool-create.8 index a36ae260a158..d5696ad85f6b 100644 --- a/man/man8/zpool-create.8 +++ b/man/man8/zpool-create.8 @@ -239,6 +239,41 @@ The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Dl # Nm zpool Cm create Ar pool Sy mirror Pa sda sdb Sy mirror Pa sdc sdd Sy log mirror Pa sde sdf . 
+.Ss Example 7 : No Creating a ZFS Pool with dRAID vdev +The following command creates a ZFS storage pool with dRAID vdev +with one parity, four data and one spare devices, 6 devices in total: +.Dl # Nm zpool Cm create Ar pool Sy draid1:4d:6c:1s Pa sda sdb sdc sdd sde sdf +. +.Ss Example 8 : No Creating a ZFS Pool with dRAID vdev with failure domains +The following commands create a ZFS storage pool with dRAID vdev +with five failure groups and six failure domains (for example, enclosures). +The commands are equivalent: +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm create Ar pool Sy draid1:4d:6c:30w:5s No \e + \fIenc0d0 enc1d0 enc2d0 enc3d0 enc4d0 enc5d0\fP \e + \fIenc0d1 enc1d1 enc2d1 enc3d1 enc4d1 enc5d1\fP \e + \fIenc0d2 enc1d2 enc2d2 enc3d2 enc4d2 enc5d2\fP \e + \fIenc0d3 enc1d3 enc2d3 enc3d3 enc4d3 enc5d3\fP \e + \fIenc0d4 enc1d4 enc2d4 enc3d4 enc4d4 enc5d4\fP +.Ed +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm create Ar pool Sy draid1:5s No \e + \fBfgroup\fP \fIenc0d0 enc1d0 enc2d0 enc3d0 enc4d0 enc5d0\fP \e + \fBfgroup\fP \fIenc0d1 enc1d1 enc2d1 enc3d1 enc4d1 enc5d1\fP \e + \fBfgroup\fP \fIenc0d2 enc1d2 enc2d2 enc3d2 enc4d2 enc5d2\fP \e + \fBfgroup\fP \fIenc0d3 enc1d3 enc2d3 enc3d3 enc4d3 enc5d3\fP \e + \fBfgroup\fP \fIenc0d4 enc1d4 enc2d4 enc3d4 enc4d4 enc5d4\fP +.Ed +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm create Ar pool Sy draid1:5s No \e + \fBfdomain\fP \fIenc0d0 enc0d1 enc0d2 enc0d3 enc0d4\fP \e + \fBfdomain\fP \fIenc1d0 enc1d1 enc1d2 enc1d3 enc1d4\fP \e + \fBfdomain\fP \fIenc2d0 enc2d1 enc2d2 enc2d3 enc2d4\fP \e + \fBfdomain\fP \fIenc3d0 enc3d1 enc3d2 enc3d3 enc3d4\fP \e + \fBfdomain\fP \fIenc4d0 enc4d1 enc4d2 enc4d3 enc4d4\fP \e + \fBfdomain\fP \fIenc5d0 enc5d1 enc5d2 enc5d3 enc5d4\fP +.Ed +. 
.Sh SEE ALSO .Xr zpool-destroy 8 , .Xr zpool-export 8 , diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 6ba9892eeb64..2bb19c0cf5fd 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -696,6 +696,19 @@ zpool_feature_init(void) "org.openzfs:draid", "draid", "Support for distributed spare RAID", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t draid_fdomain_deps[] = { + SPA_FEATURE_DRAID, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_DRAID_FAIL_DOMAINS, + "com.seagate:draid_failure_domains", + "draid_failure_domains", + "Support for failure domains in dRAID", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, + draid_fdomain_deps, sfeatures); + } + { static const spa_feature_t zilsaxattr_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 2c6515e93676..78ee3f783ecb 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -392,6 +392,12 @@ vdev_prop_init(void) ZFS_TYPE_VDEV, "", "ASHIFT", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_PARITY, "parity", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "PARITY", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_FDOMAIN, "failure_domain", UINT64_MAX, + PROP_READONLY, ZFS_TYPE_VDEV, "", "FDOM", B_FALSE, + sfeatures); + zprop_register_number(VDEV_PROP_FGROUP, "failure_group", UINT64_MAX, + PROP_READONLY, ZFS_TYPE_VDEV, "", "FGRP", B_FALSE, + sfeatures); zprop_register_number(VDEV_PROP_NUMCHILDREN, "numchildren", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "NUMCHILD", B_FALSE, sfeatures); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 843b1b9d66bb..c4a691e47d93 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7028,10 +7028,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj, ndraid = 0; 
+ uint64_t version, obj, ndraid = 0, draid_nfgroup = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; + boolean_t has_draid; + boolean_t has_draid_fdomains; spa_feature_t feat; const char *feat_name; const char *poolname; @@ -7078,6 +7080,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; + has_draid = B_FALSE; + has_draid_fdomains = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { @@ -7089,6 +7093,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; + if (feat == SPA_FEATURE_DRAID) + has_draid = B_TRUE; + if (feat == SPA_FEATURE_DRAID_FAIL_DOMAINS) + has_draid_fdomains = B_TRUE; } } @@ -7152,7 +7160,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && + (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, + &draid_nfgroup, 0)) == 0 && + (ndraid == 0 || has_draid || (error = SET_ERROR(ENOTSUP))) && + (draid_nfgroup == 0 || has_draid_fdomains || + (error = SET_ERROR(ENOTSUP))) && error == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) @@ -7303,6 +7315,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, for (int i = 0; i < ndraid; i++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + for (int i = 0; i < draid_nfgroup; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID_FAIL_DOMAINS, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -7899,13 +7914,26 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); } +/* + * This is 
called as a synctask to increment the draid_fail_domains feature flag + */ +static void +spa_draid_fdomains_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int nfgrp = (int)(uintptr_t)arg; + + for (int c = 0; c < nfgrp; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID_FAIL_DOMAINS, tx); +} + /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { - uint64_t txg, ndraid = 0; + uint64_t txg, ndraid = 0, draid_nfgroup = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -7944,10 +7972,15 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) * dRAID is stored in the config and used when opening the spare. */ if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, - rvd->vdev_children)) == 0) { + &draid_nfgroup, rvd->vdev_children)) == 0) { + if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; + + if (draid_nfgroup > 0 && !spa_feature_is_enabled(spa, + SPA_FEATURE_DRAID_FAIL_DOMAINS)) + return (spa_vdev_exit(spa, vd, txg, ENOTSUP)); } else { return (spa_vdev_exit(spa, vd, txg, error)); } @@ -8034,8 +8067,15 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) dmu_tx_t *tx; tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, (void *)(uintptr_t)ndraid, tx); + + if (draid_nfgroup > 0) + dsl_sync_task_nowait(spa->spa_dsl_pool, + spa_draid_fdomains_feature_incr, + (void *)(uintptr_t)draid_nfgroup, tx); + dmu_tx_commit(tx); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 3480b884ea96..d78f19db54ed 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -3425,23 +3425,51 @@ vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, /* leaf vdevs only */ continue; } + int children = vd->vdev_children; + int width = children; if (t == DTL_PARTIAL) { /* i.e. 
non-zero */ minref = 1; } else if (vdev_get_nparity(vd) != 0) { /* RAIDZ, DRAID */ minref = vdev_get_nparity(vd) + 1; + if (vd->vdev_ops == &vdev_draid_ops) { + vdev_draid_config_t *vdc = vd->vdev_tsd; + minref = vdc->vdc_nparity + 1; + children = vdc->vdc_children; + } } else { /* any kind of mirror */ minref = vd->vdev_children; } + /* + * For dRAID with failure domains, count failures + * only once for any i-th child failure in each failure + * group, but only if the failures threshold is not + * reached in any of the groups. + */ + boolean_t safe2skip = B_FALSE; + if (width > children && + vdev_draid_fail_domain_allowed(vd)) + safe2skip = B_TRUE; + space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, - cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); + for (int c = 0; c < children; c++) { + for (int i = c; i < width; i += children) { + vdev_t *cvd = vd->vdev_child[i]; + + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + boolean_t empty = + zfs_range_tree_is_empty( + cvd->vdev_dtl[s]); + mutex_exit(&cvd->vdev_dtl_lock); + + if (s == DTL_OUTAGE && !empty && + safe2skip) + break; + } } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); @@ -6290,6 +6318,15 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } +static int +vdev_get_child_idx(vdev_t *vd, uint64_t c_guid) +{ + for (int c = 0; c < vd->vdev_children; c++) + if (vd->vdev_child[c]->vdev_guid == c_guid) + return (c); + return (0); +} + int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { @@ -6396,6 +6433,25 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; + case VDEV_PROP_FDOMAIN: + case VDEV_PROP_FGROUP: + if (vd->vdev_ops->vdev_op_leaf 
&& + vd->vdev_top != NULL && + vd->vdev_top->vdev_ops == + &vdev_draid_ops) { + vdev_draid_config_t *vdc = + vd->vdev_top->vdev_tsd; + if (vdc->vdc_width == vdc->vdc_children) + continue; + int c_idx = vdev_get_child_idx( + vd->vdev_top, vd->vdev_guid); + vdev_prop_add_list(outnvl, propname, + NULL, prop == VDEV_PROP_FDOMAIN ? + (c_idx % vdc->vdc_children) : + (c_idx / vdc->vdc_children), + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 8588cfee3f7d..6e23c6e24bfb 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -23,6 +23,7 @@ * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2026, Seagate Technology, LLC. */ #include @@ -140,6 +141,58 @@ * the same for all groups (although some of the logic around computing * permutation numbers and drive offsets is more complicated). * + * === dRAID failure domains === + * + * If we put several slices alongside in a row and configure each disk in + * slice to be from different failure domain (for example an enclosure), we + * can then tolerate the failure of the whole domain -- only one device + * will be failed in every slice in this case. The column of such slices + * we will call failure group, and the row with such slices alongside we + * will call "big width row", width being multiple of children (W = C*n). + * + * Here's an example of configuration with 7 failure domains and two + * failure groups: + * + * 7 C disks in each slice, 2 slices in big 14 W rows + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 1 | 7 | 3 | 9 | 11| 5 | 13| 6 | 10| 4 | 8 | 0 | 12| 2 | device map 0 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 0 | gr1..| S | group 3 | gr4.. 
| S | row 0 + * c +-------+-------+-------+---+-------+-------+-------+---+ + * 0,1 | ..gr1 | group 2 | S | ..gr4 | group 5 | S | row 1 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 2 | 10| 12| 7 | 8 | 13| 11| 1 | 5 | 4 | 6 | 3 | 9 | 0 | device map 1 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 6 | gr7..| S | group 9 |gr10.. | S | row 2 + * c +-------+-------+-------+---+---------------+-------+---+ + * 2,3 | ..gr7 | group 8 | S |..gr10 | group 11 | S | row 3 + * +-------+---------------+---+-------+---------------+---+ + * failure group 0 failure group 1 + * + * In practice, there might be many more failure groups. And in theory, the + * width of the big rows can be much larger than the current limit of 255 imposed + * for the number of children. But we kept the same limit for now for the + * sake of simplicity of implementation. + * + * In order to preserve fast sequential resilvering in case of a disk failure, + * all failure groups must share all disks between themselves, and this is + * achieved by shuffling the disks between the groups. But only i-th disks + * in each group are shuffled between themselves, i.e. the disks from the + * same failure domains (enclosures). After that, they are shuffled within + * each group. Thus, no more than one disk from any failure domain can appear + * in any failure group as a result of this shuffling. In the above example, + * you won't find any tuple of (0, 7) or (1, 8) or (2, 9) or ... (6, 13) + * mapped to the same slice. This is done in vdev_draid_shuffle_perms(). + * + * Spare disks are evenly distributed among failure groups, so the number of + * spares should be a multiple of the number of groups, and they are shared by + * all groups. 
However, to support domain failure, we cannot have more than + * nparity - 1 failed disks in any group, no matter if they are rebuilt to + * draid spares or not (the blocks of those spares can be mapped to the disks + * from the failed domain (enclosure), and we cannot tolerate more than + * nparity failures in any failure group). + * + * * N.B. The following array describes all valid dRAID permutation maps. * Each row is used to generate a permutation map for a different number * of children from a unique seed. The seeds were generated and carefully @@ -537,6 +590,73 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) return (0); } +static void +vdev_draid_swap_perms(uint8_t *perms, uint64_t i, uint64_t j) +{ + uint8_t val = perms[i]; + + perms[i] = perms[j]; + perms[j] = val; +} + +/* + * Shuffle every i-th disk in slices that lie alongside in the big width row, + * increasing disk indices in each next slice in the row accordingly. The + * input to this function is the array of ready permutations from + * vdev_draid_generate_perms(), so in order to correctly shuffle i-th disks, + * we need to locate their position first and build a map of their locations. + * + * Note: the same Fisher-Yates shuffle algorithm is used as in + * vdev_draid_generate_perms(). 
+ */ +static void +vdev_draid_shuffle_perms(const draid_map_t *map, uint8_t *perms, uint64_t width) +{ + uint64_t cn = map->dm_children; + uint64_t n = width / cn; + uint64_t nperms = map->dm_nperms / n * n; + + if (width <= cn) + return; + + VERIFY3U(width, >=, VDEV_DRAID_MIN_CHILDREN); + VERIFY3U(width, <=, VDEV_DRAID_MAX_CHILDREN); + ASSERT0(width % cn); + + uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed }; + + uint8_t *cmap = kmem_alloc(n, KM_SLEEP); + + for (int i = 0; i < nperms; i += n) { + for (int j = 0; j < cn; j++) { + + /* locate position of the same child in other slices */ + for (int k = n - 1; k > 0; k--) + for (int l = 0; l < cn; l++) + if (perms[(i+k) * cn + l] == + perms[(i+0) * cn + j]) + cmap[k] = l; + cmap[0] = j; + + /* increase index values for slices on the right */ + for (int k = n - 1; k > 0; k--) + perms[(i+k) * cn + cmap[k]] += k * cn; + + /* shuffle */ + for (int k = n - 1; k > 0; k--) { + int l = vdev_draid_rand(draid_seed) % (k + 1); + if (k == l) + continue; + vdev_draid_swap_perms(perms, + (i+k) * cn + cmap[k], + (i+l) * cn + cmap[l]); + } + } + } + + kmem_free(cmap, n); +} + /* * Lookup the fixed draid_map_t for the requested number of children. 
*/ @@ -560,17 +680,26 @@ static void vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex, uint8_t **base, uint64_t *iter) { + uint64_t n = vdc->vdc_width / vdc->vdc_children; uint64_t ncols = vdc->vdc_children; - uint64_t poff = pindex % (vdc->vdc_nperms * ncols); + uint64_t nperms = (vdc->vdc_nperms / n) * n; + uint64_t poff = pindex % (nperms * ncols); + + ASSERT3P(nperms, >=, ncols * n); - *base = vdc->vdc_perms + (poff / ncols) * ncols; - *iter = poff % ncols; + *base = vdc->vdc_perms + (poff / (ncols * n)) * (ncols * n); + *iter = (poff % ncols) + (pindex % n) * ncols; } static inline uint64_t vdev_draid_permute_id(vdev_draid_config_t *vdc, uint8_t *base, uint64_t iter, uint64_t index) { + if (vdc->vdc_width > vdc->vdc_children) { + uint64_t off = (iter / vdc->vdc_children) * vdc->vdc_children; + return (base[(index + iter) % vdc->vdc_children + off]); + } + return ((base[index] + iter) % vdc->vdc_children); } @@ -949,7 +1078,8 @@ vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset, * - so we need to find the row where this IO group target begins */ *perm = group / ngroups; - uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) + + uint64_t n = vdc->vdc_width / vdc->vdc_children; + uint64_t row = ((*perm / n) * ((groupwidth * ngroups) / ndisks)) + (((group % ngroups) * groupwidth) / ndisks); return (((rowheight_sectors * row) + @@ -1170,8 +1300,11 @@ vdev_draid_min_asize(vdev_t *vd) ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + uint64_t ndisks = vdc->vdc_ndisks * + (vdc->vdc_width / vdc->vdc_children); + return (VDEV_DRAID_REFLOW_RESERVE + - (vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); + (vd->vdev_min_asize + ndisks - 1) / ndisks); } /* @@ -1535,7 +1668,7 @@ vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, int open_errors = 0; if (nparity > VDEV_DRAID_MAXPARITY || - vd->vdev_children < nparity + 1) { + vdc->vdc_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return 
(SET_ERROR(EINVAL)); } @@ -1548,12 +1681,26 @@ vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, vdev_open_children_subset(vd, vdev_draid_open_children); vdev_open_children_subset(vd, vdev_draid_open_spares); - /* Verify enough of the children are available to continue. */ - for (int c = 0; c < vd->vdev_children; c++) { - if (vd->vdev_child[c]->vdev_open_error != 0) { - if ((++open_errors) > nparity) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (SET_ERROR(ENXIO)); + /* + * Verify enough of the children are available to continue. + * If several disks fail at the i-th position in each slice in the + * big width row (failure groups) - they are counted as one failure, + * but only if the failures threshold is not reached in any group. + */ + boolean_t safe2skip = B_FALSE; + if (vdc->vdc_width > vdc->vdc_children && + vdev_draid_fail_domain_allowed(vd)) + safe2skip = B_TRUE; + for (int c = 0; c < vdc->vdc_children; c++) { + for (int i = c; i < vdc->vdc_width; i += vdc->vdc_children) { + if (vd->vdev_child[i]->vdev_open_error != 0) { + if ((++open_errors) > nparity) { + vd->vdev_stat.vs_aux = + VDEV_AUX_NO_REPLICAS; + return (SET_ERROR(ENXIO)); + } + if (safe2skip) + break; } } } @@ -1588,6 +1735,19 @@ vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * vdc->vdc_groupsz); + /* + * For failure groups with multiple slices in the big width row, + * round down to slice size and multiply by the number of slices + * in the "big width row" so that each failure group would have + * the same number of slices. 
+ */ + if (vdc->vdc_width > vdc->vdc_children) { + uint64_t slicesz = vdc->vdc_devslicesz * vdc->vdc_ndisks; + uint64_t n = (vdc->vdc_width / vdc->vdc_children); + *asize = (*asize / slicesz) * slicesz * n; + *max_asize = (*max_asize / slicesz) * slicesz * n; + } + return (0); } @@ -1674,10 +1834,11 @@ vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size) */ int vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, - uint64_t next_vdev_id) + uint64_t *nfgroupp, uint64_t next_vdev_id) { uint64_t draid_nspares = 0; uint64_t ndraid = 0; + uint64_t nfgroup = 0; int error; for (uint64_t i = 0; i < vd->vdev_children; i++) { @@ -1685,13 +1846,17 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, if (cvd->vdev_ops == &vdev_draid_ops) { vdev_draid_config_t *vdc = cvd->vdev_tsd; - draid_nspares += vdc->vdc_nspares; + draid_nspares += vdc->vdc_nspares * + (vdc->vdc_width / vdc->vdc_children); ndraid++; + if (vdc->vdc_width > vdc->vdc_children) + nfgroup++; } } if (draid_nspares == 0) { *ndraidp = ndraid; + *nfgroupp = nfgroup; return (0); } @@ -1718,7 +1883,8 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, continue; vdev_draid_config_t *vdc = cvd->vdev_tsd; - uint64_t nspares = vdc->vdc_nspares; + uint64_t nspares = vdc->vdc_nspares * + (vdc->vdc_width / vdc->vdc_children); uint64_t nparity = vdc->vdc_nparity; for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) { @@ -1759,6 +1925,7 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, kmem_free(new_spares, sizeof (*new_spares) * n); *ndraidp = ndraid; + *nfgroupp = nfgroup; return (0); } @@ -2100,7 +2267,7 @@ vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT(vd->vdev_ops == &vdev_draid_ops); - if (faulted > vdc->vdc_nparity) + if (faulted > vdc->vdc_nparity * (vdc->vdc_width / vdc->vdc_children)) vdev_set_state(vd, B_FALSE, 
VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) @@ -2213,10 +2380,14 @@ vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv) ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); vdev_draid_config_t *vdc = vd->vdev_tsd; + int fgrps = vdc->vdc_width / vdc->vdc_children; + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, + vdc->vdc_nspares * fgrps); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NCHILDREN, vdc->vdc_children); } /* @@ -2237,24 +2408,30 @@ vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) return (SET_ERROR(EINVAL)); } - uint_t children; + uint_t width; + uint64_t children; nvlist_t **child; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0 || children == 0 || - children > VDEV_DRAID_MAX_CHILDREN) { + &child, &width) != 0 || width == 0) { return (SET_ERROR(EINVAL)); } - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || - nspares > 100 || nspares > (children - (ndata + nparity))) { - return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NCHILDREN, &children)) { + children = width; + if (children > VDEV_DRAID_MAX_CHILDREN) + return (SET_ERROR(EINVAL)); } - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || - ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { + if (children == 0 || width % children != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || + nspares > 100) { return (SET_ERROR(EINVAL)); } + nspares /= (width / children); + /* * Validate the minimum number of children exist per group for the * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4). 
@@ -2262,6 +2439,11 @@ vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) if (children < (ndata + nparity + nspares)) return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || + ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + /* * Create the dRAID configuration using the pool nvlist configuration * and the fixed mapping for the correct number of children. @@ -2279,6 +2461,7 @@ vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) vdc->vdc_nspares = nspares; vdc->vdc_children = children; vdc->vdc_ngroups = ngroups; + vdc->vdc_width = width; vdc->vdc_nperms = map->dm_nperms; error = vdev_draid_generate_perms(map, &vdc->vdc_perms); @@ -2287,6 +2470,9 @@ vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) return (SET_ERROR(EINVAL)); } + if (width > children) + vdev_draid_shuffle_perms(map, vdc->vdc_perms, width); + /* * Derived constants. */ @@ -2324,7 +2510,7 @@ vdev_draid_nparity(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; - return (vdc->vdc_nparity); + return (vdc->vdc_nparity * (vdc->vdc_width / vdc->vdc_children)); } static uint64_t @@ -2332,7 +2518,7 @@ vdev_draid_ndisks(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; - return (vdc->vdc_ndisks); + return (vdc->vdc_ndisks * (vdc->vdc_width / vdc->vdc_children)); } vdev_ops_t vdev_draid_ops = { @@ -2436,17 +2622,25 @@ vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) vdev_t *tvd = vds->vds_draid_vdev; vdev_draid_config_t *vdc = tvd->vdev_tsd; + uint64_t n = vdc->vdc_width / vdc->vdc_children; + ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops); - ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares); + ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares * n); uint8_t *base; uint64_t iter; - uint64_t perm = physical_offset / vdc->vdc_devslicesz; + uint64_t perm = (physical_offset / vdc->vdc_devslicesz) * n; + + /* + * Adjust permutation so that it points to the correct slice in the + * big width row. 
+ */ + perm += vds->vds_spare_id / vdc->vdc_nspares; vdev_draid_get_perm(vdc, perm, &base, &iter); uint64_t cid = vdev_draid_permute_id(vdc, base, iter, - (tvd->vdev_children - 1) - vds->vds_spare_id); + (vdc->vdc_children - 1) - (vds->vds_spare_id % vdc->vdc_nspares)); vdev_t *cvd = tvd->vdev_child[cid]; if (cvd->vdev_ops == &vdev_draid_spare_ops) @@ -2455,6 +2649,40 @@ vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) return (cvd); } +/* + * Returns true if no failure group reached failures threshold so that + * enclosure failure cannot be tolerated anymore. Used spares are counted + * as failures because in case of enclosure failure their blocks can belong + * to the disks from that enclosure and can be lost. + */ +boolean_t +vdev_draid_fail_domain_allowed(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3P(vdc->vdc_width, >, vdc->vdc_children); + + int counter = 0; + + for (int c = 0; c < vdc->vdc_width; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if ((c % vdc->vdc_children) == 0) + counter = 0; + + if (cvd->vdev_ops == &vdev_spare_ops || + cvd->vdev_ops == &vdev_draid_spare_ops || + !vdev_readable(cvd)) + counter++; + + if (counter > vdc->vdc_nparity) + return (B_FALSE); + } + + return (B_TRUE); +} + static void vdev_draid_spare_close(vdev_t *vd) { @@ -2496,7 +2724,8 @@ vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL) return (SET_ERROR(EINVAL)); - if (vds->vds_spare_id >= vdc->vdc_nspares) + if (vds->vds_spare_id >= + vdc->vdc_nspares * (vdc->vdc_width / vdc->vdc_children)) return (SET_ERROR(EINVAL)); /* diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 520ddd692bda..17d9f2611611 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -3271,11 +3271,18 @@ raidz_simulate_failure(int physical_width, int original_width, int ashift, static int raidz_reconstruct(zio_t *zio, 
int *ltgts, int ntgts, int nparity) { + vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; - int physical_width = zio->io_vd->vdev_children; + int physical_width = vd->vdev_children; + int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; + + if (vd->vdev_ops == &vdev_draid_ops) { + vdev_draid_config_t *vdc = vd->vdev_tsd; + physical_width = vdc->vdc_children; + } + int original_width = (rm->rm_original_width != 0) ? rm->rm_original_width : physical_width; - int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; if (dbgmsg) { zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " @@ -3465,9 +3472,17 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) static int vdev_raidz_combrec(zio_t *zio) { - int nparity = vdev_get_nparity(zio->io_vd); + vdev_t *vd = zio->io_vd; + int nparity = vdev_get_nparity(vd); raidz_map_t *rm = zio->io_vsd; int physical_width = zio->io_vd->vdev_children; + + if (vd->vdev_ops == &vdev_draid_ops) { + vdev_draid_config_t *vdc = vd->vdev_tsd; + nparity = vdc->vdc_nparity; + physical_width = vdc->vdc_children; + } + int original_width = (rm->rm_original_width != 0) ? 
rm->rm_original_width : physical_width; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 8394bc4bcda0..5f6cb7e3dfbb 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -423,6 +423,7 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', + 'zpool_create_draid_005_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', 'zpool_create_features_006_pos', @@ -913,9 +914,10 @@ timeout = 1200 [tests/functional/redundancy] tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', - 'redundancy_draid3', 'redundancy_draid_damaged1', + 'redundancy_draid3', 'redundancy_draid_width', 'redundancy_draid_damaged1', 'redundancy_draid_damaged2', 'redundancy_draid_spare1', - 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', + 'redundancy_draid_spare2', 'redundancy_draid_spare3', + 'redundancy_draid_spare4', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe'] tags = ['functional', 'redundancy'] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 2717bf53d0b1..c11d8dd545b8 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -124,7 +124,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_spare_002_pos', 'auto_spare_double', 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', 'fault_limits', 'scrub_after_resilver', - 'suspend_on_probe_errors', 'suspend_resume_single', 'zpool_status_-s'] + 'suspend_on_probe_errors', 'suspend_resume_single', 'suspend_draid_fgroups', + 'zpool_status_-s'] tags = ['functional', 'fault'] 
[tests/functional/features/large_dnode:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index e3fcce9840d9..63e3caef7ad1 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1080,6 +1080,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_draid_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \ functional/cli_root/zpool_create/zpool_create_encrypted.ksh \ functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \ @@ -1602,6 +1603,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/scrub_after_resilver.ksh \ functional/fault/suspend_on_probe_errors.ksh \ functional/fault/suspend_resume_single.ksh \ + functional/fault/suspend_draid_fgroups.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ functional/features/async_destroy/async_destroy_001_pos.ksh \ @@ -1895,12 +1897,14 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/redundancy/redundancy_draid1.ksh \ functional/redundancy/redundancy_draid2.ksh \ functional/redundancy/redundancy_draid3.ksh \ + functional/redundancy/redundancy_draid_width.ksh \ functional/redundancy/redundancy_draid_damaged1.ksh \ functional/redundancy/redundancy_draid_damaged2.ksh \ functional/redundancy/redundancy_draid.ksh \ functional/redundancy/redundancy_draid_spare1.ksh \ functional/redundancy/redundancy_draid_spare2.ksh \ functional/redundancy/redundancy_draid_spare3.ksh \ + functional/redundancy/redundancy_draid_spare4.ksh \ functional/redundancy/redundancy_mirror.ksh \ functional/redundancy/redundancy_raidz1.ksh \ functional/redundancy/redundancy_raidz2.ksh \ diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_005_pos.ksh new file mode 100755 index 000000000000..b6115e5c5e36 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_005_pos.ksh @@ -0,0 +1,149 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# Copyright (c) 2026 Seagate Technology, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify creation of several failure groups in one big width row. +# +# STRATEGY: +# 1) Test valid stripe/spare/children/width combinations. +# 2) Test invalid stripe/spare/children/width combinations outside the +# allowed limits. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + rm -f $draid_vdevs + rmdir $TESTDIR +} + +log_assert "'zpool create draid:#d:#c:#w:#s '" + +log_onexit cleanup + +mkdir $TESTDIR + +# Generate 10 random valid configurations to test. 
+for (( i = 0; i < 10; i++ )); do + parity=$(random_int_between 1 3) + spares=$(random_int_between 0 3) + data=$(random_int_between 1 10) + n=$(random_int_between 2 4) + + (( min_children = (data + parity + spares) )) + (( max_children = 64 / n )) + children=$(random_int_between $min_children $max_children) + (( width = (children * n) )) + (( spares *= n )) + + draid="draid${parity}:${data}d:${children}c:${width}w:${spares}s" + + draid_vdevs=$(echo $TESTDIR/file.{1..$width}) + log_must truncate -s $MINVDEVSIZE $draid_vdevs + + log_must zpool create $TESTPOOL $draid $draid_vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + + # create the same pool with fgroup keywords + draid_fgrp_vdevs="" + for (( g = 0; g < n; g++ )); do + draid_fgrp_vdevs+="fgroup " + for (( c = 0; c < children; c++ )); do + draid_fgrp_vdevs+="$TESTDIR/file.$((c + (g * children) + 1)) " + done + done + + log_must zpool create $TESTPOOL $draid $draid_fgrp_vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + + # create the same pool with fdomain keywords + draid_fdom_vdevs="" + for (( c = 0; c < children; c++ )); do + draid_fdom_vdevs+="fdomain " + for (( g = 0; g < n; g++ )); do + draid_fdom_vdevs+="$TESTDIR/file.$((c + (g * children) + 1)) " + done + done + + log_must zpool create $TESTPOOL $draid $draid_fdom_vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + + rm -f $draid_vdevs +done + +children=32 +draid_vdevs=$(echo $TESTDIR/file.{1..$children}) +draid_vdevs0=$(echo $TESTDIR/file.{1..$((children / 2))}) +draid_vdevs1=$(echo $TESTDIR/file.{$((children / 2 + 1))..$children}) +draid_vdevs0_less=$(echo $TESTDIR/file.{1..$((children / 2 - 1))}) +draid_vdevs1_more=$(echo $TESTDIR/file.{$((children / 2))..$children}) +log_must truncate -s $MINVDEVSIZE $draid_vdevs + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE $draid_vdevs + +# Exceeds maximum data disks (limited by total children) +log_must zpool create $TESTPOOL draid2:14d:32w $draid_vdevs +log_must 
destroy_pool $TESTPOOL +log_mustnot zpool create $TESTPOOL draid2:14d:33w $draid_vdevs +log_mustnot zpool create $TESTPOOL draid2:14d:31w $draid_vdevs + +# One fdomain or fgroup keyword is not enough +log_mustnot zpool create $TESTPOOL draid2:14d:32w fdomain $draid_vdevs +log_mustnot zpool create $TESTPOOL draid2:14d:32w fgroup $draid_vdevs + +# The number of devices should be equal after each fdomain or fgroup +log_mustnot zpool create $TESTPOOL draid2:14d:32w fdomain $draid_vdevs0_less fdomain $draid_vdevs1_more +log_mustnot zpool create $TESTPOOL draid2:14d:32w fgroup $draid_vdevs0_less fgroup $draid_vdevs1_more + +# Keywords cannot be mixed +log_mustnot zpool create $TESTPOOL draid2:14d:32w fdomain $draid_vdevs0 fgroup $draid_vdevs1 + +# Failure groups and domains can be inferred from keywords +log_must zpool create $TESTPOOL draid2:14d fgroup $draid_vdevs0 fgroup $draid_vdevs1 +log_must poolexists $TESTPOOL +log_must test "$(get_vdev_prop failure_group $TESTPOOL draid2:14d:16c:32w-0)" == "-" +log_must destroy_pool $TESTPOOL +log_must zpool create $TESTPOOL draid1 fdomain $draid_vdevs0 fdomain $draid_vdevs1 +log_must poolexists $TESTPOOL +log_must test "$(get_vdev_prop failure_domain $TESTPOOL draid1:1d:2c:32w-0)" == "-" +log_must destroy_pool $TESTPOOL + +# Width matches vdevs, but it must be multiple of children +log_mustnot zpool create $TESTPOOL draid2:13d:15c:32w $draid_vdevs + +log_pass "'zpool create draid:#d:#c:#w:#s '" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg index f59104e19805..79992227169e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -72,6 +72,8 @@ typeset -a properties=( io_n io_t slow_io_events + failure_domain + failure_group slow_io_n slow_io_t trim_support diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg 
b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 99a4556f70d5..63b674a95f21 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -92,6 +92,7 @@ typeset -a properties=( "feature@log_spacemap" "feature@device_rebuild" "feature@draid" + "feature@draid_failure_domains" "feature@redaction_list_spill" "feature@dynamic_gang_header" "feature@physical_rewrite" diff --git a/tests/zfs-tests/tests/functional/fault/suspend_draid_fgroups.ksh b/tests/zfs-tests/tests/functional/fault/suspend_draid_fgroups.ksh new file mode 100755 index 000000000000..fe2dfc14d125 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/suspend_draid_fgroups.ksh @@ -0,0 +1,163 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc. +# Copyright (c) 2026, Seagate Technology, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/blkdev.shlib + +# +# DESCRIPTION: Verify that 4 disks removed from a draid3 with failure +# groups, when they are removed from a group, will suspend the pool. +# +# STRATEGY: +# 1. 
Disable ZED -- this test is focused on vdev_probe errors. +# 2. Create a draid3 pool with a random number of failure groups, from 2 to 6, +# where 4 disks can be removed (i.e., using scsi_debug). +# 3. Add some data to it for a resilver workload. +# 4. Replace one of the child vdevs to start a replacing vdev. +# 5. During the resilver, remove 4 disks, including one from the replacing vdev, +# from a failure group. +# 6. Verify that the pool is suspended. +# + +DEV_SIZE_MB=1024 + +DRAID_FGRP_CNT=$(random_int_between 2 6) +FILE_VDEV_CNT=$((8 * $DRAID_FGRP_CNT)) +DRAID="draid3:8c:${FILE_VDEV_CNT}w" +FILE_VDEV_SIZ=256M + +function cleanup +{ + destroy_pool $TESTPOOL + if [[ "$(cat /sys/block/$sd/device/state)" == "offline" ]]; then + log_must eval "echo running > /sys/block/$sd/device/state" + fi + unload_scsi_debug + rm -f $DATA_FILE + for i in {0..$((FILE_VDEV_CNT - 1))}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + zed_start +} + +log_onexit cleanup + +log_assert "dRAID vdev with failure groups probe errors for more disks than" \ + "parity in a group should suspend a pool" + +log_note "Stopping ZED process" +zed_stop +zpool events -c + +# Make a debug device that we can "unplug" and lose 4 drives at once +unload_scsi_debug +load_scsi_debug $DEV_SIZE_MB 1 1 1 '512b' +sd=$(get_debug_device) + +# Create 4 partitions that match the FILE_VDEV_SIZ +parted "/dev/${sd}" --script mklabel gpt +parted "/dev/${sd}" --script mkpart primary 0% 25% +parted "/dev/${sd}" --script mkpart primary 25% 50% +parted "/dev/${sd}" --script mkpart primary 50% 75% +parted "/dev/${sd}" --script mkpart primary 75% 100% +block_device_wait "/dev/${sd}" +blkdevs="/dev/${sd}1 /dev/${sd}2 /dev/${sd}3 /dev/${sd}4" + +# Create file vdevs +typeset -a filedevs +for i in {0..$((FILE_VDEV_CNT - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s $FILE_VDEV_SIZ $device + # Use all but the last one for pool create + if [[ $i -lt 
$((FILE_VDEV_CNT - 4)) ]]; then + filedevs[${#filedevs[*]}+1]=$device + fi +done + +# Create a draid3 pool that we can pull 4 disks from +log_must zpool create -f $TESTPOOL $DRAID ${filedevs[@]} $blkdevs +sync_pool $TESTPOOL + +# Add some data to the pool +log_must zfs create $TESTPOOL/fs +MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)" +SECONDS=0 +log_must fill_fs $MNTPOINT 1 200 4096 10 R +log_note "fill_fs took $SECONDS seconds" +sync_pool $TESTPOOL + +# Start a replacing vdev, but suspend the resilver +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must zpool replace -f $TESTPOOL /dev/${sd}4 $TEST_BASE_DIR/dev-$((FILE_VDEV_CNT - 1)) + +# Remove 4 disks all at once +log_must eval "echo offline > /sys/block/${sd}/device/state" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + +# Add some writes to drive the vdev probe errors +log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1 + +# Wait until sync starts, and the pool suspends +log_note "waiting for pool to suspend" +typeset -i tries=30 +until [[ $(kstat_pool $TESTPOOL state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + zpool status -s + log_fail "UNEXPECTED -- pool did not suspend" + fi + sleep 1 +done +log_note $(kstat_pool $TESTPOOL state) + +# Put the missing disks back into service +log_must eval "echo running > /sys/block/$sd/device/state" + +# Clear the vdev error states, which will reopen the vdevs and resume the pool +log_must zpool clear $TESTPOOL + +# Wait until the pool resumes +log_note "waiting for pool to resume" +tries=30 +until [[ $(kstat_pool $TESTPOOL state) != "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool did not resume" + fi + sleep 1 +done +log_must zpool wait -t resilver $TESTPOOL +sync_pool $TESTPOOL + +# Make sure a pool scrub comes back clean +log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_pass "dRAID vdev with failure groups probe errors for more 
disks than" \ + "parity in a group should suspend a pool" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib index 65435554bdbe..53e2efffac2d 100644 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -123,7 +123,7 @@ function setup_test_env log_note "Filling up the filesystem ..." typeset -i i=0 typeset file=$TESTDIR/file - typeset -i limit + typeset -li limit (( limit = $(get_prop available $pool) / 2 )) while true ; do @@ -206,15 +206,17 @@ function is_data_valid # # $1 pool name # $2 devices count +# $3 starting device index (optional, counts from 0) # -function get_vdevs #pool cnt +function get_vdevs #pool cnt off { typeset pool=$1 typeset -i cnt=$2 + typeset -i off=$3 typeset all_devs=$(zpool iostat -v $pool | awk '{print $1}' | \ grep -vEe "^pool$|^capacity$|^mirror\-[0-9]$|^raidz[1-3]\-[0-9]$|^draid[1-3].*\-[0-9]$|---" \ - -e "/old$|^$pool$") + -e "/old$|^$pool$" | tail -n +"$((off + 1))") typeset -i i=0 typeset vdevs while ((i < cnt)); do @@ -282,6 +284,43 @@ function damage_devs sync_pool $pool } +# +# Damage the pool's virtual device files starting from i-th one. 
+# +# $1 pool name +# $2 failing devices count +# $3 starting from which device (counts from 0) +# $4 damage vdevs method, if not null, we keep +# the label for the vdevs +# +function damage_devs_off +{ + typeset pool=$1 + typeset -i cnt=$2 + typeset -i off=$3 + typeset label="$4" + typeset vdevs + typeset -i bs_count=$(((MINVDEVSIZE / 1024) - 4096)) + + vdevs=$(get_vdevs $pool $cnt $off) + typeset dev + if [[ -n $label ]]; then + for dev in $vdevs; do + log_note "damage $dev (keeping label)" + log_must dd if=/dev/zero of=$dev seek=512 bs=1024 \ + count=$bs_count conv=notrunc >/dev/null 2>&1 + done + else + for dev in $vdevs; do + log_note "damage $dev" + log_must dd if=/dev/zero of=$dev bs=1024 \ + count=$bs_count conv=notrunc >/dev/null 2>&1 + done + fi + + sync_pool $pool +} + # # Clear errors in the pool caused by data corruptions # @@ -323,6 +362,26 @@ function remove_devs sync_pool $pool } +# +# Remove the specified pool's virtual device files starting from i-th one +# +# $1 Pool name +# $2 Missing devices count +# +function remove_devs_off +{ + typeset pool=$1 + typeset -i cnt=$2 + typeset -i off=$3 + typeset vdevs + + vdevs=$(get_vdevs $pool $cnt $off) + log_note "remove $vdevs" + log_must rm -f $vdevs + + sync_pool $pool +} + # # Recover the bad or missing device files in the pool # diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare4.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare4.ksh new file mode 100755 index 000000000000..0f491ebda3c7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare4.ksh @@ -0,0 +1,150 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2026 by Seagate Technology, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify resilver to dRAID distributed spares. +# +# STRATEGY: +# 1. For resilvers: +# a. Create a semi-random dRAID pool configuration which can: +# - sustain N failures (1-3) * n, and +# - has N * n distributed spares to replace all faulted vdevs +# - n is the number of fail groups in the dRAID +# - failures in the groups happen at the same time +# b. Fill the pool with data +# c. Systematically fault a vdev, then replace it with a spare +# d. Scrub the pool to verify no data was lost +# e. 
Verify the contents of files in the pool +# + +log_assert "Verify resilver to dRAID distributed spares" + +function cleanup_tunable +{ + log_must set_tunable32 REBUILD_SCRUB_ENABLED 1 + cleanup +} + +log_onexit cleanup_tunable + +log_must set_tunable32 REBUILD_SCRUB_ENABLED 0 + +for replace_mode in "healing" "sequential"; do + + if [[ "$replace_mode" = "sequential" ]]; then + flags="-s" + else + flags="" + fi + + parity=$(random_int_between 1 3) + spares=$(random_int_between 1 $parity) + data=$(random_int_between 1 8) + + (( min_children = (data + parity + spares) )) + children=$(random_int_between $min_children 16) + n=$(random_int_between 2 4) + (( width = children * n )) + off=$(random_int_between 0 $((children - parity - 1))) + + (( spares *= n )) + + draid="draid${parity}:${data}d:${children}c:${width}w:${spares}s" + + setup_test_env $TESTPOOL $draid $width + + for (( i=0; i < $spares; i+=$n )); do + + for (( j=$i; j < $((i+n)); j++ )); do + fault_vdev="$BASEDIR/vdev$((i / n + (j % n) * children + off))" + log_must zpool offline -f $TESTPOOL $fault_vdev + log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED" + done + + for (( j=$i; j < $((i+n)); j++ )); do + fault_vdev="$BASEDIR/vdev$((i / n + (j % n) * children + off))" + spare_vdev="draid${parity}-0-${j}" + log_must zpool replace -w $flags $TESTPOOL \ + $fault_vdev $spare_vdev + done + + for (( j=$i; j < $((i+n)); j++ )); do + fault_vdev="$BASEDIR/vdev$((i / n + (j % n) * children + off))" + spare_vdev="draid${parity}-0-${j}" + log_must check_vdev_state spare-$j "DEGRADED" + log_must check_vdev_state $spare_vdev "ONLINE" + log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE" + log_must zpool detach $TESTPOOL $fault_vdev + done + + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + done + + # Fail remaining drives as long as parity permits. 
+ faults_left=$parity + off=0 + for (( failed=$((spares/n)); failed < $parity; failed++ )); do + # we can still fail disks + (( ++off )) + for (( i=0; i < $n; i++ )); do + fault_vdev="$BASEDIR/vdev$((i * children + children - 1 - off))" + log_must zpool offline -f $TESTPOOL $fault_vdev + log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED" + + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + (( faults_left > 0 && faults_left-- )) + done + done + + # Make sure that faults_left failures are still allowed, but no more. + for (( i=0; i < $n; i++ )); do + fault_vdev="$BASEDIR/vdev$((i * children + children - 1))" + log_must zpool offline -f $TESTPOOL $fault_vdev + if (( $i < $faults_left)); then + log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED" + else + log_must check_vdev_state $TESTPOOL $fault_vdev "DEGRADED" + break + fi + + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + done + + log_must is_data_valid $TESTPOOL + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + cleanup +done + +log_pass "Verify resilver to dRAID distributed spares" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_width.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_width.ksh new file mode 100755 index 000000000000..e043eb5cfcec --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_width.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2026 by Seagate Technology, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid vdev with n failure groups can withstand n devices failing +# or missing, each device being i-th one in each group. +# +# STRATEGY: +# 1. Create N (3-6) * n virtual disk files. +# 2. Create draid pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage any n virtual disk files with the same offset in each group. +# 6. Verify the data is correct. +# + +verify_runnable "global" + +log_assert "Verify draid pool with n failure groups can withstand n i-th" \ + "devices failing in each group." 
+log_onexit cleanup + +typeset -i children=$(random_int_between 3 6) +typeset -i fgroups=$(random_int_between 2 4) +typeset -i ith=$(random_int_between 0 $((children - 1))) +typeset -i width=$((children * fgroups)) +setup_test_env $TESTPOOL draid:${children}c:${width}w $width + +# +# Inject data corruption errors for draid pool +# +for (( i=0; i<$fgroups; i=i+1 )); do + damage_devs_off $TESTPOOL 1 "$((ith + children*i))" "label" +done +log_must is_data_valid $TESTPOOL +log_must clear_errors $TESTPOOL + +# +# Inject bad device errors for draid pool +# +for (( i=0; i<$fgroups; i=i+1 )); do + damage_devs_off $TESTPOOL 1 "$((ith + children*i))" +done +log_must is_data_valid $TESTPOOL +log_must recover_bad_missing_devs $TESTPOOL 1 + +# +# Inject missing device errors for draid pool +# +for (( i=0; i<$fgroups; i=i+1 )); do + remove_devs_off $TESTPOOL 1 "$((ith + children*i))" +done +log_must is_data_valid $TESTPOOL + +log_pass "draid:${children}c:${width}w pool can withstand $fgroups i-th" \ + "devices failing passed."