From 2a36569929d825fa3a16c28934c6a3cedc52dae9 Mon Sep 17 00:00:00 2001 From: Andriy Tkachuk Date: Tue, 24 Feb 2026 15:30:24 +0000 Subject: [PATCH] draid: add failure domains support Currently, the only way to tolerate the failure of the whole enclosure is to configure several draid vdevs in the pool, each vdev having disks from different enclosures. But this essentially degrades draid to raidz and defeats the purpose of having fast sequential resilvering on wide pools with draid. This patch allows configuring several children groups in the same row in one draid vdev. In each such group, let's call it failure group, the user can configure disks belonging to different enclosures - failure domains. For example, in case of 10 enclosures with 10 disks each, the user can put 1st disk from each enclosure into 1st group, 2nd disk from each enclosure into 2nd group, and so on. If one enclosure fails, only one disk from each group would fail, which won't affect draid operation, and each group would have enough redundancy to recover the stored data. Of course, in case of draid2 - two enclosures can fail at a time, in case of draid3 - three enclosures (provided there are no other disk failures in each group). In order to preserve fast sequential resilvering in case of a disk failure, the groups must share all disks between themselves, and this is achieved by shuffling the disks between the groups. But only i-th disks in each group are shuffled between themselves, i.e. the disks from the same enclosures, after that they are shuffled within each group, like it is done today in an ordinary draid. Thus, no more than one disk from any enclosure can appear in any failure group as a result of this shuffling. 
For example, here's how the pool status output looks in case of two `draid1:2d:4c:1s` groups: NAME STATE READ WRITE CKSUM pool1 ONLINE 0 0 0 draid1:2d:4c:1s:8w-0 ONLINE 0 0 0 enc0d0 ONLINE 0 0 0 enc1d0 ONLINE 0 0 0 enc2d0 ONLINE 0 0 0 enc3d0 ONLINE 0 0 0 enc0d1 ONLINE 0 0 0 enc1d1 ONLINE 0 0 0 enc2d1 ONLINE 0 0 0 enc3d1 ONLINE 0 0 0 spares draid1-0-0 AVAIL draid1-0-1 AVAIL The number of failure groups is specified indirectly via the new width parameter in draid vdev configuration descriptor, which is the total number of disks and which is a multiple of children in each group. This multiple is the number of groups (width / children). Doing it this way allows the user to conveniently see how many disks draid has in an instant. Spare disks are evenly distributed among failure groups, so the number of spares should be a multiple of the number of groups, and they are shared by all groups. However, to support domain failure, we cannot have more than nparity - 1 failed disks in any group, no matter if they are rebuilt to draid spares or not (the blocks of those spares can be mapped to the disks from the failed domain (enclosure), and we cannot tolerate more than nparity failures in any failure group). The retire agent in zed is updated to not start resilvering when the domain failure happens. Otherwise, it might take a lot of computing and I/O bandwidth resources, only to be wasted when the failed domain component is replaced. Signed-off-by: Andriy Tkachuk Closes #11969. 
--- cmd/zed/agents/zfs_diagnosis.c | 2 +- cmd/zed/agents/zfs_retire.c | 139 ++++++++- cmd/zpool/zpool_main.c | 21 +- cmd/zpool/zpool_vdev.c | 201 ++++++++++-- include/libzfs.h | 1 + include/sys/fs/zfs.h | 3 + include/sys/vdev_draid.h | 7 +- include/zfeature_common.h | 1 + lib/libzfs/libzfs.abi | 40 +-- lib/libzfs/libzfs_pool.c | 28 +- lib/libzfs/libzfs_status.c | 43 ++- man/man7/vdevprops.7 | 16 +- man/man7/zpool-features.7 | 26 ++ man/man7/zpoolconcepts.7 | 40 ++- man/man8/zpool-create.8 | 35 +++ module/zcommon/zfeature_common.c | 13 + module/zcommon/zpool_prop.c | 6 + module/zfs/spa.c | 48 ++- module/zfs/vdev.c | 68 +++- module/zfs/vdev_draid.c | 291 ++++++++++++++++-- module/zfs/vdev_raidz.c | 21 +- tests/runfiles/common.run | 6 +- tests/runfiles/linux.run | 3 +- tests/zfs-tests/tests/Makefile.am | 4 + .../zpool_create_draid_005_pos.ksh | 149 +++++++++ .../cli_root/zpool_get/vdev_get.cfg | 2 + .../cli_root/zpool_get/zpool_get.cfg | 1 + .../fault/suspend_draid_fgroups.ksh | 163 ++++++++++ .../functional/redundancy/redundancy.kshlib | 65 +++- .../redundancy/redundancy_draid_spare4.ksh | 150 +++++++++ .../redundancy/redundancy_draid_width.ksh | 91 ++++++ 31 files changed, 1557 insertions(+), 127 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/suspend_draid_fgroups.ksh create mode 100755 tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare4.ksh create mode 100755 tests/zfs-tests/tests/functional/redundancy/redundancy_draid_width.ksh diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index 206caa16baa6..8058f17ce560 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -726,7 +726,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) */ if (isresource) { zfs_stats.resource_drops.fmds_value.ui64++; - fmd_hdl_debug(hdl, "discarding '%s for vdev 
%llu", + fmd_hdl_debug(hdl, "discarding '%s' for vdev %llu", class, vdev_guid); return; } diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index d68272bea731..40ad346a8624 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -100,12 +100,16 @@ find_pool(zpool_handle_t *zhp, void *data) * Find a vdev within a tree with a matching GUID. */ static nvlist_t * -find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) +find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid, + uint64_t *parent_guid) { - uint64_t guid; + uint64_t guid, saved_parent_guid; nvlist_t **child; uint_t c, children; - nvlist_t *ret; + nvlist_t *ret = NULL; + + if (parent_guid != NULL) + saved_parent_guid = *parent_guid; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && guid == search_guid) { @@ -119,8 +123,9 @@ find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) return (NULL); for (c = 0; c < children; c++) { - if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) - return (ret); + if ((ret = find_vdev(zhdl, child[c], search_guid, + parent_guid)) != NULL) + goto out; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, @@ -128,8 +133,9 @@ find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) return (NULL); for (c = 0; c < children; c++) { - if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) - return (ret); + if ((ret = find_vdev(zhdl, child[c], search_guid, + parent_guid)) != NULL) + goto out; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, @@ -137,11 +143,18 @@ find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) return (NULL); for (c = 0; c < children; c++) { - if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) - return (ret); + if ((ret = find_vdev(zhdl, child[c], search_guid, + parent_guid)) != NULL) + goto out; } return (NULL); +out: + /* If parent_guid was set, don't reset it. 
*/ + if (ret != NULL && parent_guid != NULL && + saved_parent_guid == *parent_guid) + *parent_guid = guid; + return (ret); } static int @@ -203,11 +216,12 @@ find_and_remove_spares(libzfs_handle_t *zhdl, uint64_t vdev_guid) } /* - * Given a (pool, vdev) GUID pair, find the matching pool and vdev. + * Given a (pool, vdev) GUID pair, find the matching pool, vdev and + * its top_guid. */ static zpool_handle_t * -find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, - nvlist_t **vdevp) +find_by_guid_impl(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp, uint64_t *top_guid) { find_cbdata_t cb; zpool_handle_t *zhp; @@ -229,7 +243,8 @@ find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, } if (vdev_guid != 0) { - if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { + if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid, + top_guid)) == NULL) { zpool_close(zhp); return (NULL); } @@ -238,6 +253,96 @@ find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, return (zhp); } +/* + * Given a (pool, vdev) GUID pair, find the matching pool and vdev. + */ +static zpool_handle_t * +find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp) +{ + return (find_by_guid_impl(zhdl, pool_guid, vdev_guid, vdevp, NULL)); +} + +/* + * Given a (pool, vdev) GUID pair, count the number of faulted vdevs in + * its top vdev and return TRUE if the number of failures at i-th device + * index in each dRAID failure group, equals to the number of failure groups, + * which means it's the domain failure, and the vdev is one of those faults. + * Otherwise, return FALSE. 
+ */ +static boolean_t +is_draid_fdomain_failure(libzfs_handle_t *zhdl, uint64_t pool_guid, + uint64_t vdev_guid) +{ + uint64_t guid, top_guid; + uint64_t children; + nvlist_t *nvtop, *vdev, **child; + vdev_stat_t *vs; + uint_t i, c, vdev_i = UINT_MAX, width, *nfaults_map; + + if (find_by_guid_impl(zhdl, pool_guid, vdev_guid, &vdev, + &top_guid) == NULL) + return (B_FALSE); + + if (find_by_guid_impl(zhdl, pool_guid, top_guid, &nvtop, + NULL) == NULL) + return (B_FALSE); + + if (nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, + &child, &width) != 0) + return (B_FALSE); + + if (nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_DRAID_NCHILDREN, + &children) != 0) /* not dRAID */ + return (B_FALSE); + + if (width == children) /* dRAID without failure domains */ + return (B_FALSE); + + /* + * No rush with starting resilver, it can be domain failure, + * in which case we need to wait a little to allow more devices + * to get into faulted state so that we could detect that + * it's the domain failure indeed. + */ + sleep(5); + + nfaults_map = calloc(children, sizeof (*nfaults_map)); + if (nfaults_map == NULL) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + for (c = 0; c < width; c++) { + nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &i); + + if (vs->vs_state == VDEV_STATE_FAULTED) + nfaults_map[c % children]++; + + if (vs->vs_state == VDEV_STATE_FAULTED && + nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, + &guid) == 0 && guid == vdev_guid) + vdev_i = (c % children); + } + + boolean_t res = B_FALSE; + for (c = 0; c < children; c++) { + if (c == vdev_i && nfaults_map[c] == (width / children)) { + res = B_TRUE; + break; + } + } + + free(nfaults_map); + + if (res) + fmd_hdl_debug(fmd_module_hdl("zfs-retire"), + "vdev %llu belongs to draid fdomain failure", vdev_guid); + + return (res); +} + /* * Given a vdev, attempt to replace it with every known spare until one * succeeds or we run out of devices to try. 
@@ -445,6 +550,14 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, if (vs->vs_state == VDEV_STATE_OFFLINE) return; + /* + * Resilvering domain failures can take a lot of computing and + * I/O bandwidth resources, only to be wasted when the failed + * domain component (for example enclosure) is replaced. + */ + if (is_draid_fdomain_failure(zhdl, pool_guid, vdev_guid)) + return; + /* * If state removed is requested for already removed vdev, * its a loopback event from spa_async_remove(). Just diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 265d7488dd8a..0bf33de8de66 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3528,6 +3528,11 @@ show_import(nvlist_t *config, boolean_t report_error) "accessed by another system.\n")); break; + case ZPOOL_STATUS_FAULTED_FDOM_R: + (void) printf_color(ANSI_YELLOW, gettext("One or more failure " + " domains are faulted.\n")); + break; + case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: (void) printf_color(ANSI_YELLOW, gettext("One or more devices " @@ -8039,7 +8044,7 @@ zpool_do_online(int argc, char **argv) if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("failed to open pool " - "\"%s\""), poolname); + "\"%s\"\n"), poolname); return (1); } @@ -8183,7 +8188,7 @@ zpool_do_offline(int argc, char **argv) if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("failed to open pool " - "\"%s\""), poolname); + "\"%s\"\n"), poolname); return (1); } @@ -10725,6 +10730,18 @@ print_status_reason(zpool_handle_t *zhp, status_cbdata_t *cbp, "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; + case ZPOOL_STATUS_FAULTED_FDOM_R: + (void) snprintf(status, ST_SIZE, + gettext("One or more failure domains are faulted. " + "The storage devices may be\n\tintact. 
Sufficient " + "replicas exist for the pool to continue functioning\n\t" + "in a degraded state.\n")); + (void) snprintf(action, AC_SIZE, + gettext("Replace the faulted domain device, " + "or use 'zpool clear' to mark domain\n\tstorage devices " + "repaired.\n")); + break; + case ZPOOL_STATUS_FAULTED_DEV_NR: (void) snprintf(status, ST_SIZE, gettext("One or more devices are " diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index d1e9ef76dc10..fe9f574ab7bf 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -1323,36 +1323,44 @@ is_grouping(const char *type, int *mindev, int *maxdev) * Extract the configuration parameters encoded in the dRAID type and * use them to generate a dRAID configuration. The expected format is: * - * draid[][:][:][:] + * draid[][:d][:c][:s][:w] * * The intent is to be able to generate a good configuration when no * additional information is provided. The only mandatory component * of the 'type' is the 'draid' prefix. If a value is not provided * then reasonable defaults are used. The optional components may - * appear in any order but the d/s/c suffix is required. + * appear in any order but the d/s/c/w suffix is required. 
* * Valid inputs: * - data: number of data devices per group (1-255) - * - parity: number of parity blocks per group (1-3) - * - spares: number of distributed spare (0-100) - * - children: total number of devices (1-255) + * - parity: number of parity devices per group (1-3) + * - children: total number of devices in slice (1-255) + * - width: total number of devices, multiple of children (1-255 for now) + * - spares: number of distributed spare devices (0-100), must be + * multiple of failure groups (width / children) * * Examples: * - zpool create tank draid * - zpool create tank draid2:8d:51c:2s + * - zpool create tank draid2:8d:12c:96w:8s */ static int -draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) +draid_config_by_type(nvlist_t *nv, const char *type, uint64_t width, + int nfgroup, int nfdomain) { uint64_t nparity; uint64_t nspares = 0; uint64_t ndata = UINT64_MAX; uint64_t ngroups = 1; + uint64_t children = 0; long value; if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) return (EINVAL); + if (nfgroup && nfdomain) /* must be only one of two or none */ + return (EINVAL); + nparity = (uint64_t)get_parity(type); if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { fprintf(stderr, @@ -1376,24 +1384,35 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) return (EINVAL); } - /* Expected non-zero value with c/d/s suffix */ + /* Expected non-zero value with c/d/s/w suffix */ value = strtol(p, &end, 10); char suffix = tolower(*end); if (errno != 0 || - (suffix != 'c' && suffix != 'd' && suffix != 's')) { + (suffix != 'c' && suffix != 'd' && suffix != 's' && + suffix != 'w')) { (void) fprintf(stderr, gettext("invalid dRAID " - "syntax; expected [:] not '%s'\n"), - type); + "syntax; expected [:], " + "not '%s'\n"), type); return (EINVAL); } if (suffix == 'c') { - if ((uint64_t)value != children) { + if ((uint64_t)value > width || + width % (uint64_t)value != 0) { fprintf(stderr, - gettext("invalid number 
of dRAID children; " + gettext("invalid number of dRAID disks; " + "multiple of %llu required but %llu " + "provided\n"), (u_longlong_t)value, + (u_longlong_t)width); + return (EINVAL); + } + children = value; + } else if (suffix == 'w') { + if ((uint64_t)value != width) { + fprintf(stderr, + gettext("invalid number of dRAID disks; " "%llu required but %llu provided\n"), - (u_longlong_t)value, - (u_longlong_t)children); + (u_longlong_t)value, (u_longlong_t)width); return (EINVAL); } } else if (suffix == 'd') { @@ -1405,6 +1424,42 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) } } + if (!children && nfgroup) + children = width / nfgroup; + if (!children && nfdomain) + children = nfdomain; + if (!children) + children = width; + + int fgrps = width / children; + + if ((nspares % fgrps) != 0) { + fprintf(stderr, gettext("invalid number of distributed spares " + "%llu, must be multiple of failure groups %d\n"), + (u_longlong_t)nspares, fgrps); + return (EINVAL); + } + + if (fgrps == 1 && (nfgroup || nfdomain)) { + fprintf(stderr, gettext("failure domains are not set " + "in dRAID vdev descriptor\n")); + return (EINVAL); + } + + if (fgrps > 1 && nfgroup && fgrps != nfgroup) { + fprintf(stderr, gettext("invalid number of failure groups " + "%d, must be %d\n"), nfgroup, fgrps); + return (EINVAL); + } + + if (fgrps > 1 && nfdomain && nfdomain != children) { + fprintf(stderr, gettext("invalid number of failure domains " + "%d, must be %lu\n"), nfdomain, children); + return (EINVAL); + } + + nspares /= fgrps; + /* * When a specific number of data disks is not provided limit a * redundancy group to 8 data disks. 
This value was selected to @@ -1414,8 +1469,8 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) if (children > nspares + nparity) { ndata = MIN(children - nspares - nparity, 8); } else { - fprintf(stderr, gettext("request number of " - "distributed spares %llu and parity level %llu\n" + fprintf(stderr, gettext("requested number of " + "distributed spares %llu and parity level %llu " "leaves no disks available for data\n"), (u_longlong_t)nspares, (u_longlong_t)nparity); return (EINVAL); @@ -1450,7 +1505,7 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) (u_longlong_t)(ndata + nparity + nspares)); } - if (children > VDEV_DRAID_MAX_CHILDREN) { + if (width > VDEV_DRAID_MAX_CHILDREN) { fprintf(stderr, gettext("%llu disks were provided, but " "dRAID only supports up to %u disks"), (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); @@ -1467,8 +1522,9 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) /* Store the basic dRAID configuration. 
*/ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares * fgrps); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NCHILDREN, children); return (0); } @@ -1606,10 +1662,41 @@ construct_spec(nvlist_t *props, int argc, char **argv) nlogs++; } + int nfdomain = 0, nfgroup = 0; + int fdndev = 0, fgndev = 0; + int fdndev_prev = 0, fgndev_prev = 0; + for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + if (strcmp(argv[c], "fgroup") == 0 || + strcmp(argv[c], "failure_group") == 0) { + if (fgndev_prev && + fgndev_prev != fgndev) + break; + fgndev_prev = fgndev; + fgndev = 0; + nfgroup++; + continue; + } + + if (strcmp(argv[c], "fdomain") == 0 || + strcmp(argv[c], "failure_domain") == 0) { + if (fdndev_prev && + fdndev_prev != fdndev) + break; + fdndev_prev = fdndev; + fdndev = 0; + nfdomain++; + continue; + } + + if (nfgroup) + fgndev++; + if (nfdomain) + fdndev++; + children++; child = realloc(child, children * sizeof (nvlist_t *)); @@ -1647,6 +1734,81 @@ construct_spec(nvlist_t *props, int argc, char **argv) goto spec_out; } + if ((nfdomain || nfgroup) && + strcmp(type, VDEV_TYPE_DRAID) != 0) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s is not dRAID and cannot " + "have failure domains\n"), argv[0]); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (nfgroup && nfdomain) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s has mixed configuration " + "of %d failure groups and %d failure " + "domains, it must have either fgroups or " + "fdomains, not both\n"), argv[0], + nfgroup, nfdomain); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (nfgroup == 1 || 
nfdomain == 1) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s has only one failure %s " + "configured, it must be more than one\n"), + argv[0], nfgroup ? "group" : "domain"); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (fgndev_prev != fgndev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s has different number of " + "devices in failure group %d than in " + "previous group: %d != %d\n"), argv[0], + nfgroup, fgndev, fgndev_prev); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (fdndev_prev != fdndev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s has different number of " + "devices in failure domain %d than in " + "previous domain: %d != %d\n"), argv[0], + nfdomain, fdndev, fdndev_prev); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (nfdomain) { + /* Put children in the right order */ + nvlist_t **ch = NULL; + ch = realloc(ch, + children * sizeof (nvlist_t *)); + if (ch == NULL) + zpool_no_memory(); + int dlen = children / nfdomain; + int i = 0; + for (int g = 0; g < dlen; g++) + for (int d = 0; d < nfdomain; d++) + ch[i++] = child[g + (d * dlen)]; + free(child); + child = ch; + } + argc -= c; argv += c; @@ -1692,7 +1854,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) } if (strcmp(type, VDEV_TYPE_DRAID) == 0) { if (draid_config_by_type(nv, - fulltype, children) != 0) { + fulltype, children, nfgroup, + nfdomain) != 0) { for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); diff --git a/include/libzfs.h b/include/libzfs.h index 0ff3948e117b..f3bef7af62d5 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -443,6 +443,7 @@ typedef enum { * checksum errors) has been lost. 
*/ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ + ZPOOL_STATUS_FAULTED_FDOM_R, /* faulted fdomain with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ /* diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index de2149641d21..9f3ff814c5ee 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -392,6 +392,8 @@ typedef enum { VDEV_PROP_AUTOSIT, VDEV_PROP_SLOW_IO_EVENTS, VDEV_PROP_SCHEDULER, + VDEV_PROP_FDOMAIN, + VDEV_PROP_FGROUP, VDEV_NUM_PROPS } vdev_prop_t; @@ -926,6 +928,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" #define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" #define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" +#define ZPOOL_CONFIG_DRAID_NCHILDREN "draid_nchildren" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h index e923092a39ad..e51a1a59f00b 100644 --- a/include/sys/vdev_draid.h +++ b/include/sys/vdev_draid.h @@ -68,9 +68,10 @@ typedef struct vdev_draid_config { */ uint64_t vdc_ndata; /* # of data devices in group */ uint64_t vdc_nparity; /* # of parity devices in group */ - uint64_t vdc_nspares; /* # of distributed spares */ + uint64_t vdc_nspares; /* # of distributed spares in slice */ uint64_t vdc_children; /* # of children */ uint64_t vdc_ngroups; /* # groups per slice */ + uint64_t vdc_width; /* # multiple of children */ /* * Immutable derived constants. @@ -103,7 +104,9 @@ extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); /* Functions for dRAID distributed spares. 
*/ extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t); extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); -extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t); +extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t *, + uint64_t); +extern boolean_t vdev_draid_fail_domain_allowed(vdev_t *); #ifdef __cplusplus } diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 56382ca85b55..64606de226b0 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -90,6 +90,7 @@ typedef enum spa_feature { SPA_FEATURE_DYNAMIC_GANG_HEADER, SPA_FEATURE_BLOCK_CLONING_ENDIAN, SPA_FEATURE_PHYSICAL_REWRITE, + SPA_FEATURE_DRAID_FAIL_DOMAINS, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 876433c0ba58..9ce5d719cdfa 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -690,7 +690,7 @@ - + @@ -6175,18 +6175,19 @@ - - - - - - - - - - - - + + + + + + + + + + + + + @@ -6258,7 +6259,9 @@ - + + + @@ -6542,7 +6545,8 @@ - + + @@ -9909,8 +9913,8 @@ - - + + @@ -9971,7 +9975,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index e12308b01ab1..66b6f4fe448a 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1453,11 +1453,17 @@ zpool_has_draid_vdev(nvlist_t *nvroot) */ static char * zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity, - uint64_t spares, uint64_t children) + uint64_t spares, uint64_t children, uint64_t width) { - snprintf(name, len, "%s%llu:%llud:%lluc:%llus", - VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, - (u_longlong_t)children, (u_longlong_t)spares); + if (children < width) + snprintf(name, len, "%s%llu:%llud:%lluc:%lluw:%llus", + VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, + (u_longlong_t)children, (u_longlong_t)width, + (u_longlong_t)spares); + else + snprintf(name, len, "%s%llu:%llud:%lluc:%llus", + VDEV_TYPE_DRAID, 
(u_longlong_t)parity, (u_longlong_t)data, + (u_longlong_t)children, (u_longlong_t)spares); return (name); } @@ -4584,12 +4590,12 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, * If it's a dRAID device, we add parity, groups, and spares. */ if (strcmp(path, VDEV_TYPE_DRAID) == 0) { - uint64_t ndata, nparity, nspares; + uint64_t ndata, nparity, nspares, children; nvlist_t **child; - uint_t children; + uint_t width; verify(nvlist_lookup_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); + ZPOOL_CONFIG_CHILDREN, &child, &width) == 0); nparity = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); ndata = fnvlist_lookup_uint64(nv, @@ -4597,8 +4603,12 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, nspares = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES); + if (nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_DRAID_NCHILDREN, &children) != 0) + children = width; + path = zpool_draid_name(buf, sizeof (buf), ndata, - nparity, nspares, children); + nparity, nspares, children, width); } /* @@ -5522,6 +5532,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: + case VDEV_PROP_FDOMAIN: + case VDEV_PROP_FGROUP: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index a589ca6896f0..d39172f45008 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -154,8 +154,12 @@ vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc, void *arg) /* * Detect if any leaf devices that have seen errors or could not be opened. 
+ * Returns: + * - EDOM if a failure domain in dRAID vdev is down + * - ENXIO if any device is problematic + * - 0 (zero) otherwise */ -static boolean_t +static int find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t, void *), void *arg, boolean_t ignore_replacing) { @@ -172,22 +176,41 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t, void *), const char *type = fnvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE); if (strcmp(type, VDEV_TYPE_REPLACING) == 0) - return (B_FALSE); + return (0); } if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { + + uint64_t fgrp_children = 0; + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_DRAID_NCHILDREN, + &fgrp_children); + + for (c = 0; c < fgrp_children; c++) { + int nfgrps = children / fgrp_children; + int nfaults = 0; + for (int g = 0; g < nfgrps; g++) { + if (find_vdev_problem(child[c + + (g * fgrp_children)], func, arg, + ignore_replacing)) + nfaults++; + } + if (nfaults == nfgrps) + return (EDOM); + } + for (c = 0; c < children; c++) { - if (find_vdev_problem(child[c], func, arg, - ignore_replacing)) - return (B_TRUE); + int res; + if ((res = find_vdev_problem(child[c], func, arg, + ignore_replacing))) + return (res); } } else { uint_t vsc; vdev_stat_t *vs = (vdev_stat_t *)fnvlist_lookup_uint64_array( vdev, ZPOOL_CONFIG_VDEV_STATS, &vsc); if (func(vs, vsc, arg) != 0) - return (B_TRUE); + return (ENXIO); } /* @@ -198,11 +221,11 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t, void *), for (c = 0; c < children; c++) { if (find_vdev_problem(child[c], func, arg, ignore_replacing)) - return (B_TRUE); + return (ENXIO); } } - return (B_FALSE); + return (0); } /* @@ -406,6 +429,10 @@ check_status(nvlist_t *config, boolean_t isimport, /* * Missing devices in a replicated config. 
*/ + if (find_vdev_problem(nvroot, vdev_faulted, NULL, B_TRUE) == EDOM) + return (ZPOOL_STATUS_FAULTED_FDOM_R); + if (find_vdev_problem(nvroot, vdev_missing, NULL, B_TRUE) == EDOM) + return (ZPOOL_STATUS_FAULTED_FDOM_R); if (find_vdev_problem(nvroot, vdev_faulted, NULL, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_R); if (find_vdev_problem(nvroot, vdev_missing, NULL, B_TRUE)) diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 3b65a52ae630..02ad1df48230 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -64,7 +64,7 @@ The values of non-numeric properties are case sensitive and must be lowercase. The following native properties consist of read-only statistics about the vdev. These properties can not be changed. -.Bl -tag -width "fragmentation" +.Bl -tag -width "failure_domain" .It Sy capacity Percentage of vdev space used .It Sy state @@ -89,6 +89,14 @@ How much this vdev can expand by Percent of fragmentation in this vdev .It Sy parity The level of parity for this vdev +.It Sy failure_domain +Failure domain id of this child vdev in +.Sy dRAID +vdev with failure domains feature +.It Sy failure_group +Failure group id of this child vdev in +.Sy dRAID +vdev with failure domains feature .It Sy devid The device id for this vdev .It Sy physpath @@ -114,7 +122,7 @@ threshold in milliseconds For .Sy RAIDZ and -.Sy DRAID +.Sy dRAID configurations, this value also represents the number of times the vdev was identified as an outlier and excluded from participating in read I/O operations. .It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops @@ -166,7 +174,7 @@ failfast. Only valid for .Sy RAIDZ and -.Sy DRAID +.Sy dRAID vdevs. True when a slow disk outlier was detected and the vdev is currently in a sit out state. @@ -180,7 +188,7 @@ data will be reconstructed as needed from parity. Only valid for .Sy RAIDZ and -.Sy DRAID +.Sy dRAID vdevs. If set, this enables the kernel-level slow disk detection logic. 
This logic automatically causes any vdevs that are significant negative diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b4404a6eb58d..aeedaaca5a25 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -504,6 +504,32 @@ vdev type, or when adding a new .Sy draid vdev to an existing pool. . +.feature com.seagate draid_failure_domains no draid +This feature enables use of failure domains in +.Sy draid +vdev type. +Failure domains allow for an entire set of devices that belong to a domain +to fail without taking the pool offline. +Devices that are likely to fail together due to sharing a common component, +such as an enclosure, HBA, or SAS expander, are good candidates to form a +failure domain. +For example, on a setup with several enclosures the user defines a failure +domain for each enclosure with all its devices and can arrange devices into +failure groups in such a way that every i-th device in every group belongs +to i-th enclosure. +This will allow tolerating the failure of the whole enclosure. +The size of the failure group is equal to the number of failure domains, +and it cannot be less than the size of the redundancy group +(parity + data + spares). +.Pp +This feature becomes +.Sy active +when creating a pool which uses the +.Sy draid +vdev type with failure domains configured, or when adding a new +.Sy draid +vdev with failure domains to an existing pool. +. .feature com.klarasystems dynamic_gang_header no This feature enables larger gang headers based on the sector size of the pool. When enabled, gang headers will use the entire space allocated for them, instead diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 07b78dda2396..ba7cd6399fa1 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -26,6 +26,7 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
+.\" Copyright (c) 2026 Seagate Technology, LLC. .\" .Dd August 6, 2025 .Dt ZPOOLCONCEPTS 7 @@ -142,7 +143,7 @@ A dRAID with .No parity level, and Em S No distributed hot spares can hold approximately .Em (N-S)*(D/(D+P))*X No bytes and can withstand Em P devices failing without losing data. -.It Sy draid Ns Oo Ar parity Oc Ns Oo Sy \&: Ns Ar data Ns Sy d Oc Ns Oo Sy \&: Ns Ar children Ns Sy c Oc Ns Oo Sy \&: Ns Ar spares Ns Sy s Oc +.It Sy draid Ns Oo Ar parity Oc Ns Oo Sy \&: Ns Ar data Ns Sy d Oc Ns Oo Sy \&: Ns Ar children Ns Sy c Oc Ns Oo Sy \&: Ns Ar width Ns Sy w Oc Ns Oo Sy \&: Ns Ar spares Ns Sy s Oc A non-default dRAID configuration can be specified by appending one or more of the following optional arguments to the .Sy draid @@ -161,9 +162,34 @@ Defaults to The expected number of children. Useful as a cross-check when listing a large number of devices. An error is returned when the provided number of children differs. +.It Ar width +You can configure several groups of children in the same row, in which case +.Em width No would be a multiple of Em children . +Such configurations allow the creation of failure groups with every i-th device +in each group being from a different failure domain (for example, an enclosure) +so that if all devices in one domain fail, the +.Em draid No vdev will still be operational with enough redundancy to +rebuild the data. +In case of +.Em draid2 , No two domains can fail at a time, in case of +.Em draid3 No \(em three domains (provided there are no other failures +in any failure group). +For each group, there will be only one, two, or three failures. .It Ar spares The number of distributed hot spares. +If failure domains are configured +.Em ( width No > Em children ) , No it must be a +multiple of the number of failure groups so that each group has the same +number of spares. +All spares are shared between failure groups. Defaults to zero. 
+.Pp +Note: to support domain failure, we cannot have more than +.Em parity-1 No failures in any failure group, no matter if the failed +devices are rebuilt to draid hot spares or not \(em the blocks of those +spares can be mapped to the devices from the failed domain, and we cannot +tolerate more than +.Em parity No failures in any failure group . .El .It Sy spare A pseudo-vdev which keeps track of available hot spares for a pool. @@ -202,6 +228,10 @@ A cache device cannot be configured as a mirror or raidz group. For more information, see the .Sx Cache Devices section. +.It Sy fdomain No or Sy failure_domain +Denotes the list of failure domain devices for dRAID vdev. +.It Sy fgroup No or Sy failure_group +Denotes the list of failure group devices for dRAID vdev. .El .Pp Virtual devices cannot be nested arbitrarily. @@ -364,7 +394,13 @@ pools. The .Sy draid vdev type provides distributed hot spares. -These hot spares are named after the dRAID vdev they're a part of +These are virtual devices whose blocks are reserved and distributed among +all real devices, which makes resilvering to them much faster because one +device is not a bottleneck anymore. +Fast resilvering is crucial for data durability, it decreases the time of +having degraded data redundancy in the pool, thus decreasing the chance of +losing more devices at a time than we can tolerate. +dRAID hot spares are named after the draid vdev they're a part of .Po Sy draid1 Ns - Ns Ar 2 Ns - Ns Ar 3 No specifies spare Ar 3 No of vdev Ar 2 , .No which is a single parity dRAID Pc and may only be used by that dRAID vdev. diff --git a/man/man8/zpool-create.8 b/man/man8/zpool-create.8 index a36ae260a158..d5696ad85f6b 100644 --- a/man/man8/zpool-create.8 +++ b/man/man8/zpool-create.8 @@ -239,6 +239,41 @@ The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Dl # Nm zpool Cm create Ar pool Sy mirror Pa sda sdb Sy mirror Pa sdc sdd Sy log mirror Pa sde sdf . 
+.Ss Example 7 : No Creating a ZFS Pool with dRAID vdev +The following command creates a ZFS storage pool with dRAID vdev +with one parity, four data and one spare devices, 6 devices in total: +.Dl # Nm zpool Cm create Ar pool Sy draid1:4d:6c:1s Pa sda sdb sdc sdd sde sdf +. +.Ss Example 8 : No Creating a ZFS Pool with dRAID vdev with failure domains +The following commands create a ZFS storage pool with dRAID vdev +with five failure groups and six failure domains (for example, enclosures). +The commands are equivalent: +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm create Ar pool Sy draid1:4d:6c:30w:5s No \e + \fIenc0d0 enc1d0 enc2d0 enc3d0 enc4d0 enc5d0\fP \e + \fIenc0d1 enc1d1 enc2d1 enc3d1 enc4d1 enc5d1\fP \e + \fIenc0d2 enc1d2 enc2d2 enc3d2 enc4d2 enc5d2\fP \e + \fIenc0d3 enc1d3 enc2d3 enc3d3 enc4d3 enc5d3\fP \e + \fIenc0d4 enc1d4 enc2d4 enc3d4 enc4d4 enc5d4\fP +.Ed +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm create Ar pool Sy draid1:5s No \e + \fBfgroup\fP \fIenc0d0 enc1d0 enc2d0 enc3d0 enc4d0 enc5d0\fP \e + \fBfgroup\fP \fIenc0d1 enc1d1 enc2d1 enc3d1 enc4d1 enc5d1\fP \e + \fBfgroup\fP \fIenc0d2 enc1d2 enc2d2 enc3d2 enc4d2 enc5d2\fP \e + \fBfgroup\fP \fIenc0d3 enc1d3 enc2d3 enc3d3 enc4d3 enc5d3\fP \e + \fBfgroup\fP \fIenc0d4 enc1d4 enc2d4 enc3d4 enc4d4 enc5d4\fP +.Ed +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm create Ar pool Sy draid1:5s No \e + \fBfdomain\fP \fIenc0d0 enc0d1 enc0d2 enc0d3 enc0d4\fP \e + \fBfdomain\fP \fIenc1d0 enc1d1 enc1d2 enc1d3 enc1d4\fP \e + \fBfdomain\fP \fIenc2d0 enc2d1 enc2d2 enc2d3 enc2d4\fP \e + \fBfdomain\fP \fIenc3d0 enc3d1 enc3d2 enc3d3 enc3d4\fP \e + \fBfdomain\fP \fIenc4d0 enc4d1 enc4d2 enc4d3 enc4d4\fP \e + \fBfdomain\fP \fIenc5d0 enc5d1 enc5d2 enc5d3 enc5d4\fP +.Ed +. 
.Sh SEE ALSO .Xr zpool-destroy 8 , .Xr zpool-export 8 , diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 6ba9892eeb64..2bb19c0cf5fd 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -696,6 +696,19 @@ zpool_feature_init(void) "org.openzfs:draid", "draid", "Support for distributed spare RAID", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t draid_fdomain_deps[] = { + SPA_FEATURE_DRAID, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_DRAID_FAIL_DOMAINS, + "com.seagate:draid_failure_domains", + "draid_failure_domains", + "Support for failure domains in dRAID", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, + draid_fdomain_deps, sfeatures); + } + { static const spa_feature_t zilsaxattr_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 2c6515e93676..78ee3f783ecb 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -392,6 +392,12 @@ vdev_prop_init(void) ZFS_TYPE_VDEV, "", "ASHIFT", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_PARITY, "parity", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "PARITY", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_FDOMAIN, "failure_domain", UINT64_MAX, + PROP_READONLY, ZFS_TYPE_VDEV, "", "FDOM", B_FALSE, + sfeatures); + zprop_register_number(VDEV_PROP_FGROUP, "failure_group", UINT64_MAX, + PROP_READONLY, ZFS_TYPE_VDEV, "", "FGRP", B_FALSE, + sfeatures); zprop_register_number(VDEV_PROP_NUMCHILDREN, "numchildren", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "NUMCHILD", B_FALSE, sfeatures); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 843b1b9d66bb..c4a691e47d93 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7028,10 +7028,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj, ndraid = 0; 
+ uint64_t version, obj, ndraid = 0, draid_nfgroup = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; + boolean_t has_draid; + boolean_t has_draid_fdomains; spa_feature_t feat; const char *feat_name; const char *poolname; @@ -7078,6 +7080,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; + has_draid = B_FALSE; + has_draid_fdomains = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { @@ -7089,6 +7093,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; + if (feat == SPA_FEATURE_DRAID) + has_draid = B_TRUE; + if (feat == SPA_FEATURE_DRAID_FAIL_DOMAINS) + has_draid_fdomains = B_TRUE; } } @@ -7152,7 +7160,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && + (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, + &draid_nfgroup, 0)) == 0 && + (ndraid == 0 || has_draid || (error = SET_ERROR(ENOTSUP))) && + (draid_nfgroup == 0 || has_draid_fdomains || + (error = SET_ERROR(ENOTSUP))) && error == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) @@ -7303,6 +7315,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, for (int i = 0; i < ndraid; i++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + for (int i = 0; i < draid_nfgroup; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID_FAIL_DOMAINS, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -7899,13 +7914,26 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); } +/* + * This is 
called as a synctask to increment the draid_fail_domains feature flag + */ +static void +spa_draid_fdomains_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int nfgrp = (int)(uintptr_t)arg; + + for (int c = 0; c < nfgrp; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID_FAIL_DOMAINS, tx); +} + /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { - uint64_t txg, ndraid = 0; + uint64_t txg, ndraid = 0, draid_nfgroup = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -7944,10 +7972,15 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) * dRAID is stored in the config and used when opening the spare. */ if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, - rvd->vdev_children)) == 0) { + &draid_nfgroup, rvd->vdev_children)) == 0) { + if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; + + if (draid_nfgroup > 0 && !spa_feature_is_enabled(spa, + SPA_FEATURE_DRAID_FAIL_DOMAINS)) + return (spa_vdev_exit(spa, vd, txg, ENOTSUP)); } else { return (spa_vdev_exit(spa, vd, txg, error)); } @@ -8034,8 +8067,15 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) dmu_tx_t *tx; tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, (void *)(uintptr_t)ndraid, tx); + + if (draid_nfgroup > 0) + dsl_sync_task_nowait(spa->spa_dsl_pool, + spa_draid_fdomains_feature_incr, + (void *)(uintptr_t)draid_nfgroup, tx); + dmu_tx_commit(tx); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 3480b884ea96..d78f19db54ed 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -3425,23 +3425,51 @@ vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, /* leaf vdevs only */ continue; } + int children = vd->vdev_children; + int width = children; if (t == DTL_PARTIAL) { /* i.e. 
non-zero */ minref = 1; } else if (vdev_get_nparity(vd) != 0) { /* RAIDZ, DRAID */ minref = vdev_get_nparity(vd) + 1; + if (vd->vdev_ops == &vdev_draid_ops) { + vdev_draid_config_t *vdc = vd->vdev_tsd; + minref = vdc->vdc_nparity + 1; + children = vdc->vdc_children; + } } else { /* any kind of mirror */ minref = vd->vdev_children; } + /* + * For dRAID with failure domains, count failures + * only once for any i-th child failure in each failure + * group, but only if the failures threshold is not + * reached in any of the groups. + */ + boolean_t safe2skip = B_FALSE; + if (width > children && + vdev_draid_fail_domain_allowed(vd)) + safe2skip = B_TRUE; + space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, - cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); + for (int c = 0; c < children; c++) { + for (int i = c; i < width; i += children) { + vdev_t *cvd = vd->vdev_child[i]; + + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + boolean_t empty = + zfs_range_tree_is_empty( + cvd->vdev_dtl[s]); + mutex_exit(&cvd->vdev_dtl_lock); + + if (s == DTL_OUTAGE && !empty && + safe2skip) + break; + } } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); @@ -6290,6 +6318,15 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } +static int +vdev_get_child_idx(vdev_t *vd, uint64_t c_guid) +{ + for (int c = 0; c < vd->vdev_children; c++) + if (vd->vdev_child[c]->vdev_guid == c_guid) + return (c); + return (0); +} + int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { @@ -6396,6 +6433,25 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; + case VDEV_PROP_FDOMAIN: + case VDEV_PROP_FGROUP: + if (vd->vdev_ops->vdev_op_leaf 
&& + vd->vdev_top != NULL && + vd->vdev_top->vdev_ops == + &vdev_draid_ops) { + vdev_draid_config_t *vdc = + vd->vdev_top->vdev_tsd; + if (vdc->vdc_width == vdc->vdc_children) + continue; + int c_idx = vdev_get_child_idx( + vd->vdev_top, vd->vdev_guid); + vdev_prop_add_list(outnvl, propname, + NULL, prop == VDEV_PROP_FDOMAIN ? + (c_idx % vdc->vdc_children) : + (c_idx / vdc->vdc_children), + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 8588cfee3f7d..6e23c6e24bfb 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -23,6 +23,7 @@ * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2026, Seagate Technology, LLC. */ #include @@ -140,6 +141,58 @@ * the same for all groups (although some of the logic around computing * permutation numbers and drive offsets is more complicated). * + * === dRAID failure domains === + * + * If we put several slices alongside in a row and configure each disk in + * slice to be from different failure domain (for example an enclosure), we + * can then tolerate the failure of the whole domain -- only one device + * will be failed in every slice in this case. The column of such slices + * we will call failure group, and the row with such slices alongside we + * will call "big width row", width being multiple of children (W = C*n). + * + * Here's an example of configuration with 7 failure domains and two + * failure groups: + * + * 7 C disks in each slice, 2 slices in big 14 W rows + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 1 | 7 | 3 | 9 | 11| 5 | 13| 6 | 10| 4 | 8 | 0 | 12| 2 | device map 0 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 0 | gr1..| S | group 3 | gr4.. 
| S | row 0 + * c +-------+-------+-------+---+-------+-------+-------+---+ + * 0,1 | ..gr1 | group 2 | S | ..gr4 | group 5 | S | row 1 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 2 | 10| 12| 7 | 8 | 13| 11| 1 | 5 | 4 | 6 | 3 | 9 | 0 | device map 1 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 6 | gr7..| S | group 9 |gr10.. | S | row 2 + * c +-------+-------+-------+---+---------------+-------+---+ + * 2,3 | ..gr7 | group 8 | S |..gr10 | group 11 | S | row 3 + * +-------+---------------+---+-------+---------------+---+ + * failure group 0 failure group 1 + * + * In practice, there might be many more failure groups. And in theory, the + * width of the big rows can be much larger than the current limit of 255 imposed + * for the number of children. But we kept the same limit for now for the + * sake of simplicity of implementation. + * + * In order to preserve fast sequential resilvering in case of a disk failure, + * all failure groups must share all disks between themselves, and this is + * achieved by shuffling the disks between the groups. But only i-th disks + * in each group are shuffled between themselves, i.e. the disks from the + * same failure domains (enclosures). After that, they are shuffled within + * each group. Thus, no more than one disk from any failure domain can appear + * in any failure group as a result of this shuffling. In the above example, + * you won't find any tuple of (0, 7) or (1, 8) or (2, 9) or ... (6, 13) + * mapped to the same slice. This is done in vdev_draid_shuffle_perms(). + * + * Spare disks are evenly distributed among failure groups, so the number of + * spares should be a multiple of the number of groups, and they are shared by + * all groups. 
However, to support domain failure, we cannot have more than + * nparity - 1 failed disks in any group, no matter if they are rebuilt to + * draid spares or not (the blocks of those spares can be mapped to the disks + * from the failed domain (enclosure), and we cannot tolerate more than + * nparity failures in any failure group). + * + * * N.B. The following array describes all valid dRAID permutation maps. * Each row is used to generate a permutation map for a different number * of children from a unique seed. The seeds were generated and carefully @@ -537,6 +590,73 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) return (0); } +static void +vdev_draid_swap_perms(uint8_t *perms, uint64_t i, uint64_t j) +{ + uint8_t val = perms[i]; + + perms[i] = perms[j]; + perms[j] = val; +} + +/* + * Shuffle every i-th disk in slices that lie alongside in the big width row, + * increasing disk indices in each next slice in the row accordingly. The + * input to this function is the array of ready permutations from + * vdev_draid_generate_perms(), so in order to correctly shuffle i-th disks, + * we need to locate their position first and build a map of their locations. + * + * Note: the same Fisher-Yates shuffle algorithm is used as in + * vdev_draid_generate_perms(). 
+ */ +static void +vdev_draid_shuffle_perms(const draid_map_t *map, uint8_t *perms, uint64_t width) +{ + uint64_t cn = map->dm_children; + uint64_t n = width / cn; + uint64_t nperms = map->dm_nperms / n * n; + + if (width <= cn) + return; + + VERIFY3U(width, >=, VDEV_DRAID_MIN_CHILDREN); + VERIFY3U(width, <=, VDEV_DRAID_MAX_CHILDREN); + ASSERT0(width % cn); + + uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed }; + + uint8_t *cmap = kmem_alloc(n, KM_SLEEP); + + for (int i = 0; i < nperms; i += n) { + for (int j = 0; j < cn; j++) { + + /* locate position of the same child in other slices */ + for (int k = n - 1; k > 0; k--) + for (int l = 0; l < cn; l++) + if (perms[(i+k) * cn + l] == + perms[(i+0) * cn + j]) + cmap[k] = l; + cmap[0] = j; + + /* increase index values for slices on the right */ + for (int k = n - 1; k > 0; k--) + perms[(i+k) * cn + cmap[k]] += k * cn; + + /* shuffle */ + for (int k = n - 1; k > 0; k--) { + int l = vdev_draid_rand(draid_seed) % (k + 1); + if (k == l) + continue; + vdev_draid_swap_perms(perms, + (i+k) * cn + cmap[k], + (i+l) * cn + cmap[l]); + } + } + } + + kmem_free(cmap, n); +} + /* * Lookup the fixed draid_map_t for the requested number of children. 
*/ @@ -560,17 +680,26 @@ static void vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex, uint8_t **base, uint64_t *iter) { + uint64_t n = vdc->vdc_width / vdc->vdc_children; uint64_t ncols = vdc->vdc_children; - uint64_t poff = pindex % (vdc->vdc_nperms * ncols); + uint64_t nperms = (vdc->vdc_nperms / n) * n; + uint64_t poff = pindex % (nperms * ncols); + + /* integer comparison, so ASSERT3U (not the pointer variant ASSERT3P) */ + ASSERT3U(nperms, >=, ncols * n); - *base = vdc->vdc_perms + (poff / ncols) * ncols; - *iter = poff % ncols; + *base = vdc->vdc_perms + (poff / (ncols * n)) * (ncols * n); + *iter = (poff % ncols) + (pindex % n) * ncols; } static inline uint64_t vdev_draid_permute_id(vdev_draid_config_t *vdc, uint8_t *base, uint64_t iter, uint64_t index) { + if (vdc->vdc_width > vdc->vdc_children) { + uint64_t off = (iter / vdc->vdc_children) * vdc->vdc_children; + return (base[(index + iter) % vdc->vdc_children + off]); + } + return ((base[index] + iter) % vdc->vdc_children); } @@ -949,7 +1078,8 @@ vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset, * - so we need to find the row where this IO group target begins */ *perm = group / ngroups; - uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) + + uint64_t n = vdc->vdc_width / vdc->vdc_children; + uint64_t row = ((*perm / n) * ((groupwidth * ngroups) / ndisks)) + (((group % ngroups) * groupwidth) / ndisks); return (((rowheight_sectors * row) + @@ -1170,8 +1300,11 @@ vdev_draid_min_asize(vdev_t *vd) ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + uint64_t ndisks = vdc->vdc_ndisks * + (vdc->vdc_width / vdc->vdc_children); + return (VDEV_DRAID_REFLOW_RESERVE + - (vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); + (vd->vdev_min_asize + ndisks - 1) / ndisks); } /* @@ -1535,7 +1668,7 @@ vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, int open_errors = 0; if (nparity > VDEV_DRAID_MAXPARITY || - vd->vdev_children < nparity + 1) { + vdc->vdc_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return 
(SET_ERROR(EINVAL)); } @@ -1548,12 +1681,26 @@ vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, vdev_open_children_subset(vd, vdev_draid_open_children); vdev_open_children_subset(vd, vdev_draid_open_spares); - /* Verify enough of the children are available to continue. */ - for (int c = 0; c < vd->vdev_children; c++) { - if (vd->vdev_child[c]->vdev_open_error != 0) { - if ((++open_errors) > nparity) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (SET_ERROR(ENXIO)); + /* + * Verify enough of the children are available to continue. + * If several disks have failed at the i-th position in each slice in the + * big width row (failure groups) - they are counted as one failure, + * but only if the failures threshold is not reached in any group. + */ + boolean_t safe2skip = B_FALSE; + if (vdc->vdc_width > vdc->vdc_children && + vdev_draid_fail_domain_allowed(vd)) + safe2skip = B_TRUE; + for (int c = 0; c < vdc->vdc_children; c++) { + for (int i = c; i < vdc->vdc_width; i += vdc->vdc_children) { + if (vd->vdev_child[i]->vdev_open_error != 0) { + if ((++open_errors) > nparity) { + vd->vdev_stat.vs_aux = + VDEV_AUX_NO_REPLICAS; + return (SET_ERROR(ENXIO)); + } + if (safe2skip) + break; } } } @@ -1588,6 +1735,19 @@ vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * vdc->vdc_groupsz); + /* + * For failure groups with multiple slices in the big width row, + * round down to slice size and multiply by the number of slices + * in the "big width row" so that each failure group would have + * the same number of slices. 
+ */ + if (vdc->vdc_width > vdc->vdc_children) { + uint64_t slicesz = vdc->vdc_devslicesz * vdc->vdc_ndisks; + uint64_t n = (vdc->vdc_width / vdc->vdc_children); + *asize = (*asize / slicesz) * slicesz * n; + *max_asize = (*max_asize / slicesz) * slicesz * n; + } + return (0); } @@ -1674,10 +1834,11 @@ vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size) */ int vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, - uint64_t next_vdev_id) + uint64_t *nfgroupp, uint64_t next_vdev_id) { uint64_t draid_nspares = 0; uint64_t ndraid = 0; + uint64_t nfgroup = 0; int error; for (uint64_t i = 0; i < vd->vdev_children; i++) { @@ -1685,13 +1846,17 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, if (cvd->vdev_ops == &vdev_draid_ops) { vdev_draid_config_t *vdc = cvd->vdev_tsd; - draid_nspares += vdc->vdc_nspares; + draid_nspares += vdc->vdc_nspares * + (vdc->vdc_width / vdc->vdc_children); ndraid++; + if (vdc->vdc_width > vdc->vdc_children) + nfgroup++; } } if (draid_nspares == 0) { *ndraidp = ndraid; + *nfgroupp = nfgroup; return (0); } @@ -1718,7 +1883,8 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, continue; vdev_draid_config_t *vdc = cvd->vdev_tsd; - uint64_t nspares = vdc->vdc_nspares; + uint64_t nspares = vdc->vdc_nspares * + (vdc->vdc_width / vdc->vdc_children); uint64_t nparity = vdc->vdc_nparity; for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) { @@ -1759,6 +1925,7 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, kmem_free(new_spares, sizeof (*new_spares) * n); *ndraidp = ndraid; + *nfgroupp = nfgroup; return (0); } @@ -2100,7 +2267,7 @@ vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT(vd->vdev_ops == &vdev_draid_ops); - if (faulted > vdc->vdc_nparity) + if (faulted > vdc->vdc_nparity * (vdc->vdc_width / vdc->vdc_children)) vdev_set_state(vd, B_FALSE, 
VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) @@ -2213,10 +2380,14 @@ vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv) ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); vdev_draid_config_t *vdc = vd->vdev_tsd; + int fgrps = vdc->vdc_width / vdc->vdc_children; + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, + vdc->vdc_nspares * fgrps); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NCHILDREN, vdc->vdc_children); } /* @@ -2237,24 +2408,30 @@ vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) return (SET_ERROR(EINVAL)); } - uint_t children; + uint_t width; + uint64_t children; nvlist_t **child; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0 || children == 0 || - children > VDEV_DRAID_MAX_CHILDREN) { + &child, &width) != 0 || width == 0) { return (SET_ERROR(EINVAL)); } - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || - nspares > 100 || nspares > (children - (ndata + nparity))) { - return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NCHILDREN, &children)) { + children = width; + if (children > VDEV_DRAID_MAX_CHILDREN) + return (SET_ERROR(EINVAL)); } - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || - ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { + if (children == 0 || width % children != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || + nspares > 100) { return (SET_ERROR(EINVAL)); } + nspares /= (width / children); + /* * Validate the minimum number of children exist per group for the * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4). 
@@ -2262,6 +2439,11 @@ vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) if (children < (ndata + nparity + nspares)) return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || + ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + /* * Create the dRAID configuration using the pool nvlist configuration * and the fixed mapping for the correct number of children. @@ -2279,6 +2461,7 @@ vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) vdc->vdc_nspares = nspares; vdc->vdc_children = children; vdc->vdc_ngroups = ngroups; + vdc->vdc_width = width; vdc->vdc_nperms = map->dm_nperms; error = vdev_draid_generate_perms(map, &vdc->vdc_perms); @@ -2287,6 +2470,9 @@ vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) return (SET_ERROR(EINVAL)); } + if (width > children) + vdev_draid_shuffle_perms(map, vdc->vdc_perms, width); + /* * Derived constants. */ @@ -2324,7 +2510,7 @@ vdev_draid_nparity(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; - return (vdc->vdc_nparity); + return (vdc->vdc_nparity * (vdc->vdc_width / vdc->vdc_children)); } static uint64_t @@ -2332,7 +2518,7 @@ vdev_draid_ndisks(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; - return (vdc->vdc_ndisks); + return (vdc->vdc_ndisks * (vdc->vdc_width / vdc->vdc_children)); } vdev_ops_t vdev_draid_ops = { @@ -2436,17 +2622,25 @@ vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) vdev_t *tvd = vds->vds_draid_vdev; vdev_draid_config_t *vdc = tvd->vdev_tsd; + uint64_t n = vdc->vdc_width / vdc->vdc_children; + ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops); - ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares); + ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares * n); uint8_t *base; uint64_t iter; - uint64_t perm = physical_offset / vdc->vdc_devslicesz; + uint64_t perm = (physical_offset / vdc->vdc_devslicesz) * n; + + /* + * Adjust permutation so that it points to the correct slice in the + * big width row. 
+ */ + perm += vds->vds_spare_id / vdc->vdc_nspares; vdev_draid_get_perm(vdc, perm, &base, &iter); uint64_t cid = vdev_draid_permute_id(vdc, base, iter, - (tvd->vdev_children - 1) - vds->vds_spare_id); + (vdc->vdc_children - 1) - (vds->vds_spare_id % vdc->vdc_nspares)); vdev_t *cvd = tvd->vdev_child[cid]; if (cvd->vdev_ops == &vdev_draid_spare_ops) @@ -2455,6 +2649,40 @@ vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) return (cvd); } +/* + * Returns true if no failure group reached failures threshold so that + * enclosure failure cannot be tolerated anymore. Used spares are counted + * as failures because in case of enclosure failure their blocks can belong + * to the disks from that enclosure and can be lost. + */ +boolean_t +vdev_draid_fail_domain_allowed(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + /* width/children are uint64_t - use the integer assert, not ASSERT3P */ + ASSERT3U(vdc->vdc_width, >, vdc->vdc_children); + + int counter = 0; + + for (int c = 0; c < vdc->vdc_width; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if ((c % vdc->vdc_children) == 0) + counter = 0; + + if (cvd->vdev_ops == &vdev_spare_ops || + cvd->vdev_ops == &vdev_draid_spare_ops || + !vdev_readable(cvd)) + counter++; + + if (counter > vdc->vdc_nparity) + return (B_FALSE); + } + + return (B_TRUE); +} + static void vdev_draid_spare_close(vdev_t *vd) { @@ -2496,7 +2724,8 @@ vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL) return (SET_ERROR(EINVAL)); - if (vds->vds_spare_id >= vdc->vdc_nspares) + if (vds->vds_spare_id >= + vdc->vdc_nspares * (vdc->vdc_width / vdc->vdc_children)) return (SET_ERROR(EINVAL)); /* diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 520ddd692bda..17d9f2611611 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -3271,11 +3271,18 @@ raidz_simulate_failure(int physical_width, int original_width, int ashift, static int raidz_reconstruct(zio_t *zio, 
int *ltgts, int ntgts, int nparity) { + vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; - int physical_width = zio->io_vd->vdev_children; + int physical_width = vd->vdev_children; + int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; + + if (vd->vdev_ops == &vdev_draid_ops) { + vdev_draid_config_t *vdc = vd->vdev_tsd; + physical_width = vdc->vdc_children; + } + int original_width = (rm->rm_original_width != 0) ? rm->rm_original_width : physical_width; - int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; if (dbgmsg) { zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " @@ -3465,9 +3472,17 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) static int vdev_raidz_combrec(zio_t *zio) { - int nparity = vdev_get_nparity(zio->io_vd); + vdev_t *vd = zio->io_vd; + int nparity = vdev_get_nparity(vd); raidz_map_t *rm = zio->io_vsd; int physical_width = zio->io_vd->vdev_children; + + if (vd->vdev_ops == &vdev_draid_ops) { + vdev_draid_config_t *vdc = vd->vdev_tsd; + nparity = vdc->vdc_nparity; + physical_width = vdc->vdc_children; + } + int original_width = (rm->rm_original_width != 0) ? 
rm->rm_original_width : physical_width; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 8394bc4bcda0..5f6cb7e3dfbb 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -423,6 +423,7 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', + 'zpool_create_draid_005_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', 'zpool_create_features_006_pos', @@ -913,9 +914,10 @@ timeout = 1200 [tests/functional/redundancy] tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', - 'redundancy_draid3', 'redundancy_draid_damaged1', + 'redundancy_draid3', 'redundancy_draid_width', 'redundancy_draid_damaged1', 'redundancy_draid_damaged2', 'redundancy_draid_spare1', - 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', + 'redundancy_draid_spare2', 'redundancy_draid_spare3', + 'redundancy_draid_spare4', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe'] tags = ['functional', 'redundancy'] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 2717bf53d0b1..c11d8dd545b8 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -124,7 +124,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_spare_002_pos', 'auto_spare_double', 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', 'fault_limits', 'scrub_after_resilver', - 'suspend_on_probe_errors', 'suspend_resume_single', 'zpool_status_-s'] + 'suspend_on_probe_errors', 'suspend_resume_single', 'suspend_draid_fgroups', + 'zpool_status_-s'] tags = ['functional', 'fault'] 
[tests/functional/features/large_dnode:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index e3fcce9840d9..63e3caef7ad1 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1080,6 +1080,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_draid_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \ functional/cli_root/zpool_create/zpool_create_encrypted.ksh \ functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \ @@ -1602,6 +1603,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/scrub_after_resilver.ksh \ functional/fault/suspend_on_probe_errors.ksh \ functional/fault/suspend_resume_single.ksh \ + functional/fault/suspend_draid_fgroups.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ functional/features/async_destroy/async_destroy_001_pos.ksh \ @@ -1895,12 +1897,14 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/redundancy/redundancy_draid1.ksh \ functional/redundancy/redundancy_draid2.ksh \ functional/redundancy/redundancy_draid3.ksh \ + functional/redundancy/redundancy_draid_width.ksh \ functional/redundancy/redundancy_draid_damaged1.ksh \ functional/redundancy/redundancy_draid_damaged2.ksh \ functional/redundancy/redundancy_draid.ksh \ functional/redundancy/redundancy_draid_spare1.ksh \ functional/redundancy/redundancy_draid_spare2.ksh \ functional/redundancy/redundancy_draid_spare3.ksh \ + functional/redundancy/redundancy_draid_spare4.ksh \ functional/redundancy/redundancy_mirror.ksh \ functional/redundancy/redundancy_raidz1.ksh \ functional/redundancy/redundancy_raidz2.ksh \ diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_005_pos.ksh new file mode 100755 index 000000000000..b6115e5c5e36 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_005_pos.ksh @@ -0,0 +1,149 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# Copyright (c) 2026 Seagate Technology, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify creation of several failure groups in one big width row. +# +# STRATEGY: +# 1) Test valid stripe/spare/children/width combinations. +# 2) Test invalid stripe/spare/children/width combinations outside the +# allowed limits. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + rm -f $draid_vdevs + rmdir $TESTDIR +} + +log_assert "'zpool create draid:#d:#c:#w:#s '" + +log_onexit cleanup + +mkdir $TESTDIR + +# Generate 10 random valid configurations to test. 
+for (( i = 0; i < 10; i++ )); do + parity=$(random_int_between 1 3) + spares=$(random_int_between 0 3) + data=$(random_int_between 1 10) + n=$(random_int_between 2 4) + + (( min_children = (data + parity + spares) )) + (( max_children = 64 / n )) + children=$(random_int_between $min_children $max_children) + (( width = (children * n) )) + (( spares *= n )) + + draid="draid${parity}:${data}d:${children}c:${width}w:${spares}s" + + draid_vdevs=$(echo $TESTDIR/file.{1..$width}) + log_must truncate -s $MINVDEVSIZE $draid_vdevs + + log_must zpool create $TESTPOOL $draid $draid_vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + + # create the same pool with fgroup keywords + draid_fgrp_vdevs="" + for (( g = 0; g < n; g++ )); do + draid_fgrp_vdevs+="fgroup " + for (( c = 0; c < children; c++ )); do + draid_fgrp_vdevs+="$TESTDIR/file.$((c + (g * children) + 1)) " + done + done + + log_must zpool create $TESTPOOL $draid $draid_fgrp_vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + + # create the same pool with fdomain keywords + draid_fdom_vdevs="" + for (( c = 0; c < children; c++ )); do + draid_fdom_vdevs+="fdomain " + for (( g = 0; g < n; g++ )); do + draid_fdom_vdevs+="$TESTDIR/file.$((c + (g * children) + 1)) " + done + done + + log_must zpool create $TESTPOOL $draid $draid_fgrp_vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + + rm -f $draid_vdevs +done + +children=32 +draid_vdevs=$(echo $TESTDIR/file.{1..$children}) +draid_vdevs0=$(echo $TESTDIR/file.{1..$((children / 2))}) +draid_vdevs1=$(echo $TESTDIR/file.{$((children / 2 + 1))..$children}) +draid_vdevs0_less=$(echo $TESTDIR/file.{1..$((children / 2 - 1))}) +draid_vdevs1_more=$(echo $TESTDIR/file.{$((children / 2))..$children}) +log_must truncate -s $MINVDEVSIZE $draid_vdevs + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE $draid_vdevs + +# Exceeds maximum data disks (limited by total children) +log_must zpool create $TESTPOOL draid2:14d:32w $draid_vdevs +log_must 
destroy_pool $TESTPOOL +log_mustnot zpool create $TESTPOOL draid2:14d:33w $draid_vdevs +log_mustnot zpool create $TESTPOOL draid2:14d:31w $draid_vdevs + +# One fdomain or fgroup keyword is not enough +log_mustnot zpool create $TESTPOOL draid2:14d:32w fdomain $draid_vdevs +log_mustnot zpool create $TESTPOOL draid2:14d:32w fgroup $draid_vdevs + +# The number of devices should be equal after each fdomain or fgroup +log_mustnot zpool create $TESTPOOL draid2:14d:32w fdomain $draid_vdevs0_less fdomain $draid_vdevs1_more +log_mustnot zpool create $TESTPOOL draid2:14d:32w fgroup $draid_vdevs0_less fgroup $draid_vdevs1_more + +# Keywords cannot be mixed +log_mustnot zpool create $TESTPOOL draid2:14d:32w fdomain $draid_vdevs0 fgroup $draid_vdevs1 + +# Failure groups and domains can be inferred from keywords +log_must zpool create $TESTPOOL draid2:14d fgroup $draid_vdevs0 fgroup $draid_vdevs1 +log_must poolexists $TESTPOOL +log_must test "$(get_vdev_prop failure_group $TESTPOOL draid2:14d:16c:32w-0)" == "-" +log_must destroy_pool $TESTPOOL +log_must zpool create $TESTPOOL draid1 fdomain $draid_vdevs0 fdomain $draid_vdevs1 +log_must poolexists $TESTPOOL +log_must test "$(get_vdev_prop failure_domain $TESTPOOL draid1:1d:2c:32w-0)" == "-" +log_must destroy_pool $TESTPOOL + +# Width matches vdevs, but it must be multiple of children +log_mustnot zpool create $TESTPOOL draid2:13d:15c:32w $draid_vdevs + +log_pass "'zpool create draid:#d:#c:#w:#s '" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg index f59104e19805..79992227169e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -72,6 +72,8 @@ typeset -a properties=( io_n io_t slow_io_events + failure_domain + failure_group slow_io_n slow_io_t trim_support diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg 
b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 99a4556f70d5..63b674a95f21 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -92,6 +92,7 @@ typeset -a properties=( "feature@log_spacemap" "feature@device_rebuild" "feature@draid" + "feature@draid_failure_domains" "feature@redaction_list_spill" "feature@dynamic_gang_header" "feature@physical_rewrite" diff --git a/tests/zfs-tests/tests/functional/fault/suspend_draid_fgroups.ksh b/tests/zfs-tests/tests/functional/fault/suspend_draid_fgroups.ksh new file mode 100755 index 000000000000..fe2dfc14d125 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/suspend_draid_fgroups.ksh @@ -0,0 +1,163 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc. +# Copyright (c) 2026, Seagate Technology, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/blkdev.shlib + +# +# DESCRIPTION: Verify that 4 disks removed from a draid3 with failure +# groups, when they are removed from a group, will suspend the pool. +# +# STRATEGY: +# 1. 
Disable ZED -- this test is focused on vdev_probe errors. +# 2. Create a draid3 pool whith random number of failure groups, from 2 to 6, +# where 4 disks can be removed (i.e., using scsi_debug). +# 3. Add some data to it for a resilver workload. +# 4. Replace one of the child vdevs to start a replacing vdev. +# 5. During the resilver, remove 4 disks, including one from the replacing vdev, +# from a failure group. +# 6. Verify that the pool is suspended. +# + +DEV_SIZE_MB=1024 + +DRAID_FGRP_CNT=$(random_int_between 2 6) +FILE_VDEV_CNT=$((8 * $DRAID_FGRP_CNT)) +DRAID="draid3:8c:${FILE_VDEV_CNT}w" +FILE_VDEV_SIZ=256M + +function cleanup +{ + destroy_pool $TESTPOOL + if [[ "$(cat /sys/block/$sd/device/state)" == "offline" ]]; then + log_must eval "echo running > /sys/block/$sd/device/state" + fi + unload_scsi_debug + rm -f $DATA_FILE + for i in {0..$((FILE_VDEV_CNT - 1))}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + zed_start +} + +log_onexit cleanup + +log_assert "dRAID vdev with failure groups probe errors for more disks than" \ + "parity in a group should suspend a pool" + +log_note "Stoping ZED process" +zed_stop +zpool events -c + +# Make a debug device that we can "unplug" and lose 4 drives at once +unload_scsi_debug +load_scsi_debug $DEV_SIZE_MB 1 1 1 '512b' +sd=$(get_debug_device) + +# Create 4 partitions that match the FILE_VDEV_SIZ +parted "/dev/${sd}" --script mklabel gpt +parted "/dev/${sd}" --script mkpart primary 0% 25% +parted "/dev/${sd}" --script mkpart primary 25% 50% +parted "/dev/${sd}" --script mkpart primary 50% 75% +parted "/dev/${sd}" --script mkpart primary 75% 100% +block_device_wait "/dev/${sd}" +blkdevs="/dev/${sd}1 /dev/${sd}2 /dev/${sd}3 /dev/${sd}4" + +# Create file vdevs +typeset -a filedevs +for i in {0..$((FILE_VDEV_CNT - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s $FILE_VDEV_SIZ $device + # Use all but the last one for pool create + if [[ $i -lt 
$((FILE_VDEV_CNT - 4)) ]]; then + filedevs[${#filedevs[*]}+1]=$device + fi +done + +# Create a draid3 pool that we can pull 4 disks from +log_must zpool create -f $TESTPOOL $DRAID ${filedevs[@]} $blkdevs +sync_pool $TESTPOOL + +# Add some data to the pool +log_must zfs create $TESTPOOL/fs +MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)" +SECONDS=0 +log_must fill_fs $MNTPOINT 1 200 4096 10 R +log_note "fill_fs took $SECONDS seconds" +sync_pool $TESTPOOL + +# Start a replacing vdev, but suspend the resilver +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must zpool replace -f $TESTPOOL /dev/${sd}4 $TEST_BASE_DIR/dev-$((FILE_VDEV_CNT - 1)) + +# Remove 4 disks all at once +log_must eval "echo offline > /sys/block/${sd}/device/state" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + +# Add some writes to drive the vdev probe errors +log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1 + +# Wait until sync starts, and the pool suspends +log_note "waiting for pool to suspend" +typeset -i tries=30 +until [[ $(kstat_pool $TESTPOOL state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + zpool status -s + log_fail "UNEXPECTED -- pool did not suspend" + fi + sleep 1 +done +log_note $(kstat_pool $TESTPOOL state) + +# Put the missing disks back into service +log_must eval "echo running > /sys/block/$sd/device/state" + +# Clear the vdev error states, which will reopen the vdevs and resume the pool +log_must zpool clear $TESTPOOL + +# Wait until the pool resumes +log_note "waiting for pool to resume" +tries=30 +until [[ $(kstat_pool $TESTPOOL state) != "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool did not resume" + fi + sleep 1 +done +log_must zpool wait -t resilver $TESTPOOL +sync_pool $TESTPOOL + +# Make sure a pool scrub comes back clean +log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_pass "dRAID vdev with failure groups probe errors for more 
disks than" \ + "parity in a group should suspend a pool" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib index 65435554bdbe..53e2efffac2d 100644 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -123,7 +123,7 @@ function setup_test_env log_note "Filling up the filesystem ..." typeset -i i=0 typeset file=$TESTDIR/file - typeset -i limit + typeset -li limit (( limit = $(get_prop available $pool) / 2 )) while true ; do @@ -206,15 +206,17 @@ function is_data_valid # # $1 pool name # $2 devices count +# $3 starting device index (optional, counts from 0) # -function get_vdevs #pool cnt +function get_vdevs #pool cnt off { typeset pool=$1 typeset -i cnt=$2 + typeset -i off=$3 typeset all_devs=$(zpool iostat -v $pool | awk '{print $1}' | \ grep -vEe "^pool$|^capacity$|^mirror\-[0-9]$|^raidz[1-3]\-[0-9]$|^draid[1-3].*\-[0-9]$|---" \ - -e "/old$|^$pool$") + -e "/old$|^$pool$" | tail -n +"$((off + 1))") typeset -i i=0 typeset vdevs while ((i < cnt)); do @@ -282,6 +284,43 @@ function damage_devs sync_pool $pool } +# +# Damage the pool's virtual device files starting from i-th one. 
+# +# $1 pool name +# $2 failing devices count +# $3 starting from which device (counts from 0) +# $3 damage vdevs method, if not null, we keep +# the label for the vdevs +# +function damage_devs_off +{ + typeset pool=$1 + typeset -i cnt=$2 + typeset -i off=$3 + typeset label="$4" + typeset vdevs + typeset -i bs_count=$(((MINVDEVSIZE / 1024) - 4096)) + + vdevs=$(get_vdevs $pool $cnt $off) + typeset dev + if [[ -n $label ]]; then + for dev in $vdevs; do + log_note "damage $dev (keeping label)" + log_must dd if=/dev/zero of=$dev seek=512 bs=1024 \ + count=$bs_count conv=notrunc >/dev/null 2>&1 + done + else + for dev in $vdevs; do + log_note "damage $dev" + log_must dd if=/dev/zero of=$dev bs=1024 \ + count=$bs_count conv=notrunc >/dev/null 2>&1 + done + fi + + sync_pool $pool +} + # # Clear errors in the pool caused by data corruptions # @@ -323,6 +362,26 @@ function remove_devs sync_pool $pool } +# +# Remove the specified pool's virtual device files starting from i-th one +# +# $1 Pool name +# $2 Missing devices count +# +function remove_devs_off +{ + typeset pool=$1 + typeset -i cnt=$2 + typeset -i off=$3 + typeset vdevs + + vdevs=$(get_vdevs $pool $cnt $off) + log_note "remove $vdevs" + log_must rm -f $vdevs + + sync_pool $pool +} + # # Recover the bad or missing device files in the pool # diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare4.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare4.ksh new file mode 100755 index 000000000000..0f491ebda3c7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare4.ksh @@ -0,0 +1,150 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2026 by Seagate Technology, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify resilver to dRAID distributed spares. +# +# STRATEGY: +# 1. For resilvers: +# a. Create a semi-random dRAID pool configuration which can: +# - sustain N failures (1-3) * n, and +# - has N * n distributed spares to replace all faulted vdevs +# - n is the number of fail groups in the dRAID +# - failures in the groups happen at the same time +# b. Fill the pool with data +# c. Systematically fault a vdev, then replace it with a spare +# d. Scrub the pool to verify no data was lost +# e. 
Verify the contents of files in the pool +# + +log_assert "Verify resilver to dRAID distributed spares" + +function cleanup_tunable +{ + log_must set_tunable32 REBUILD_SCRUB_ENABLED 1 + cleanup +} + +log_onexit cleanup_tunable + +log_must set_tunable32 REBUILD_SCRUB_ENABLED 0 + +for replace_mode in "healing" "sequential"; do + + if [[ "$replace_mode" = "sequential" ]]; then + flags="-s" + else + flags="" + fi + + parity=$(random_int_between 1 3) + spares=$(random_int_between 1 $parity) + data=$(random_int_between 1 8) + + (( min_children = (data + parity + spares) )) + children=$(random_int_between $min_children 16) + n=$(random_int_between 2 4) + (( width = children * n )) + off=$(random_int_between 0 $((children - parity - 1))) + + (( spares *= n )) + + draid="draid${parity}:${data}d:${children}c:${width}w:${spares}s" + + setup_test_env $TESTPOOL $draid $width + + for (( i=0; i < $spares; i+=$n )); do + + for (( j=$i; j < $((i+n)); j++ )); do + fault_vdev="$BASEDIR/vdev$((i / n + (j % n) * children + off))" + log_must zpool offline -f $TESTPOOL $fault_vdev + log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED" + done + + for (( j=$i; j < $((i+n)); j++ )); do + fault_vdev="$BASEDIR/vdev$((i / n + (j % n) * children + off))" + spare_vdev="draid${parity}-0-${j}" + log_must zpool replace -w $flags $TESTPOOL \ + $fault_vdev $spare_vdev + done + + for (( j=$i; j < $((i+n)); j++ )); do + fault_vdev="$BASEDIR/vdev$((i / n + (j % n) * children + off))" + spare_vdev="draid${parity}-0-${j}" + log_must check_vdev_state spare-$j "DEGRADED" + log_must check_vdev_state $spare_vdev "ONLINE" + log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE" + log_must zpool detach $TESTPOOL $fault_vdev + done + + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + done + + # Fail remaining drives as long as parity permits. 
+ faults_left=$parity + off=0 + for (( failed=$((spares/n)); failed < $parity; failed++ )); do + # we can still fail disks + (( ++off )) + for (( i=0; i < $n; i++ )); do + fault_vdev="$BASEDIR/vdev$((i * children + children - 1 - off))" + log_must zpool offline -f $TESTPOOL $fault_vdev + log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED" + + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + (( faults_left > 0 && faults_left-- )) + done + done + + # Make sure that faults_left failures are still allowed, but no more. + for (( i=0; i < $n; i++ )); do + fault_vdev="$BASEDIR/vdev$((i * children + children - 1))" + log_must zpool offline -f $TESTPOOL $fault_vdev + if (( $i < $faults_left)); then + log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED" + else + log_must check_vdev_state $TESTPOOL $fault_vdev "DEGRADED" + break + fi + + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + done + + log_must is_data_valid $TESTPOOL + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + cleanup +done + +log_pass "Verify resilver to dRAID distributed spares" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_width.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_width.ksh new file mode 100755 index 000000000000..e043eb5cfcec --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_width.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2026 by Seagate Technology, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid vdev with n failure grups can withstand n devices failing +# or missing, each device being i-th one in each group. +# +# STRATEGY: +# 1. Create N(>3,<6) * n virtual disk files. +# 2. Create draid pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage any n virtual disk files with the same offset in each group. +# 6. Verify the data is correct. +# + +verify_runnable "global" + +log_assert "Verify draid pool with n failure groups can withstand n i-th" \ + "devices failing in each group." 
+log_onexit cleanup + +typeset -i children=$(random_int_between 3 6) +typeset -i fgroups=$(random_int_between 2 4) +typeset -i ith=$(random_int_between 0 $((children - 1))) +typeset -i width=$((children * fgroups)) +setup_test_env $TESTPOOL draid:${children}c:${width}w $width + +# +# Inject data corruption errors for draid pool +# +for (( i=0; i<$fgroups; i=i+1 )); do + damage_devs_off $TESTPOOL 1 "$((ith + children*i))" "label" +done +log_must is_data_valid $TESTPOOL +log_must clear_errors $TESTPOOL + +# +# Inject bad device errors for draid pool +# +for (( i=0; i<$fgroups; i=i+1 )); do + damage_devs_off $TESTPOOL 1 "$((ith + children*i))" +done +log_must is_data_valid $TESTPOOL +log_must recover_bad_missing_devs $TESTPOOL 1 + +# +# Inject missing device errors for draid pool +# +for (( i=0; i<$fgroups; i=i+1 )); do + remove_devs_off $TESTPOOL 1 "$((ith + children*i))" +done +log_must is_data_valid $TESTPOOL + +log_pass "draid:${children}c:${width}w pool can withstand $fgroups i-th" \ + "devices failing passed."