Skip to content

Commit 0f25b76

Browse files
committed
draid: add failure domains support
Currently, the only way to tolerate the failure of a whole enclosure is to configure several draid vdevs in the pool, each vdev having disks from different enclosures. But this essentially degrades draid to raidz and defeats the purpose of having fast sequential resilvering on wide pools with draid. This patch allows configuring several children groups in the same row in one draid vdev. In each such group, let's call it a failure group, the user can configure disks belonging to different enclosures - failure domains. For example, in the case of 10 enclosures with 10 disks each, the user can put the 1st disk from each enclosure into the 1st group, the 2nd disk from each enclosure into the 2nd group, and so on. If one enclosure fails, only one disk from each group would fail, which won't affect draid operation, and each group would have enough redundancy to recover the stored data. Of course, in the case of draid2 - two enclosures can fail at a time, in the case of draid3 - three enclosures (provided there are no other disk failures in each group). In order to preserve fast sequential resilvering in case of a disk failure, the groups must share all disks between themselves, and this is achieved by shuffling the disks between the groups. But only the i-th disks in each group are shuffled among themselves, i.e. the disks from the same enclosures; after that they are shuffled within each group, as is done today in an ordinary draid. Thus, no more than one disk from any enclosure can appear in any failure group as a result of this shuffling. 
For example, here's what the pool status output looks like in the case of two `draid1:2d:4c:1s` groups: NAME STATE READ WRITE CKSUM pool1 ONLINE 0 0 0 draid1:2d:4c:1s:8w-0 ONLINE 0 0 0 enc0d0 ONLINE 0 0 0 enc1d0 ONLINE 0 0 0 enc2d0 ONLINE 0 0 0 enc3d0 ONLINE 0 0 0 enc0d1 ONLINE 0 0 0 enc1d1 ONLINE 0 0 0 enc2d1 ONLINE 0 0 0 enc3d1 ONLINE 0 0 0 spares draid1-0-0 AVAIL draid1-0-1 AVAIL The number of failure groups is specified indirectly via the new width parameter in the draid vdev configuration descriptor, which is the total number of disks and which is a multiple of the children in each group. This multiple is the number of groups (width / children). Doing it this way allows the user to conveniently see how many disks the draid has at a glance. Spare disks are evenly distributed among failure groups, so the number of spares should be a multiple of the number of groups, and they are shared by all groups. However, to support domain failure, we cannot have more than nparity - 1 failed disks in any group, no matter whether they are rebuilt to draid spares or not (the blocks of those spares can be mapped to the disks from the failed domain (enclosure), and we cannot tolerate more than nparity failures in any failure group). Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com> Signed-off-by: Andriy Tkachuk <andriy.tkachuk@seagate.com> Closes #11969.
1 parent 7e33476 commit 0f25b76

29 files changed

+1395
-113
lines changed

cmd/zpool/zpool_main.c

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3528,6 +3528,11 @@ show_import(nvlist_t *config, boolean_t report_error)
35283528
"accessed by another system.\n"));
35293529
break;
35303530

3531+
case ZPOOL_STATUS_FAULTED_FDOM_R:
3532+
(void) printf_color(ANSI_YELLOW, gettext("One or more failure "
3533+
" domains are faulted.\n"));
3534+
break;
3535+
35313536
case ZPOOL_STATUS_FAULTED_DEV_R:
35323537
case ZPOOL_STATUS_FAULTED_DEV_NR:
35333538
(void) printf_color(ANSI_YELLOW, gettext("One or more devices "
@@ -8030,7 +8035,7 @@ zpool_do_online(int argc, char **argv)
80308035

80318036
if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
80328037
(void) fprintf(stderr, gettext("failed to open pool "
8033-
"\"%s\""), poolname);
8038+
"\"%s\"\n"), poolname);
80348039
return (1);
80358040
}
80368041

@@ -8174,7 +8179,7 @@ zpool_do_offline(int argc, char **argv)
81748179

81758180
if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
81768181
(void) fprintf(stderr, gettext("failed to open pool "
8177-
"\"%s\""), poolname);
8182+
"\"%s\"\n"), poolname);
81788183
return (1);
81798184
}
81808185

@@ -10715,6 +10720,18 @@ print_status_reason(zpool_handle_t *zhp, status_cbdata_t *cbp,
1071510720
"or use 'zpool clear' to mark the device\n\trepaired.\n"));
1071610721
break;
1071710722

10723+
case ZPOOL_STATUS_FAULTED_FDOM_R:
10724+
(void) snprintf(status, ST_SIZE,
10725+
gettext("One or more failure domains are faulted. "
10726+
"The storage devices may be\n\tintact. Sufficient "
10727+
"replicas exist for the pool to continue functioning\n\t"
10728+
"in a degraded state.\n"));
10729+
(void) snprintf(action, AC_SIZE,
10730+
gettext("Replace the faulted domain device, "
10731+
"or use 'zpool clear' to mark domain\n\tstorage devices "
10732+
"repaired.\n"));
10733+
break;
10734+
1071810735
case ZPOOL_STATUS_FAULTED_DEV_NR:
1071910736
(void) snprintf(status, ST_SIZE,
1072010737
gettext("One or more devices are "

cmd/zpool/zpool_vdev.c

Lines changed: 182 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1323,36 +1323,44 @@ is_grouping(const char *type, int *mindev, int *maxdev)
13231323
* Extract the configuration parameters encoded in the dRAID type and
13241324
* use them to generate a dRAID configuration. The expected format is:
13251325
*
1326-
* draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]
1326+
* draid[<parity>][:<data>d][:<children>c][:<spares>s][:<width>w]
13271327
*
13281328
* The intent is to be able to generate a good configuration when no
13291329
* additional information is provided. The only mandatory component
13301330
* of the 'type' is the 'draid' prefix. If a value is not provided
13311331
* then reasonable defaults are used. The optional components may
1332-
* appear in any order but the d/s/c suffix is required.
1332+
* appear in any order but the d/s/c/w suffix is required.
13331333
*
13341334
* Valid inputs:
13351335
* - data: number of data devices per group (1-255)
1336-
* - parity: number of parity blocks per group (1-3)
1337-
* - spares: number of distributed spare (0-100)
1338-
* - children: total number of devices (1-255)
1336+
* - parity: number of parity devices per group (1-3)
1337+
* - children: total number of devices in slice (1-255)
1338+
* - width: total number of devices, multiple of children (1-255 for now)
1339+
* - spares: number of distributed spare devices (0-100), must be
1340+
* multiple of failure groups (width / children)
13391341
*
13401342
* Examples:
13411343
* - zpool create tank draid <devices...>
13421344
* - zpool create tank draid2:8d:51c:2s <devices...>
1345+
* - zpool create tank draid2:8d:12c:96w:8s <devices...>
13431346
*/
13441347
static int
1345-
draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
1348+
draid_config_by_type(nvlist_t *nv, const char *type, uint64_t width,
1349+
int nfgroup, int nfdomain)
13461350
{
13471351
uint64_t nparity;
13481352
uint64_t nspares = 0;
13491353
uint64_t ndata = UINT64_MAX;
13501354
uint64_t ngroups = 1;
1355+
uint64_t children = 0;
13511356
long value;
13521357

13531358
if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)
13541359
return (EINVAL);
13551360

1361+
if (nfgroup && nfdomain) /* must be only one of two or none */
1362+
return (EINVAL);
1363+
13561364
nparity = (uint64_t)get_parity(type);
13571365
if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
13581366
fprintf(stderr,
@@ -1376,24 +1384,35 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
13761384
return (EINVAL);
13771385
}
13781386

1379-
/* Expected non-zero value with c/d/s suffix */
1387+
/* Expected non-zero value with c/d/s/w suffix */
13801388
value = strtol(p, &end, 10);
13811389
char suffix = tolower(*end);
13821390
if (errno != 0 ||
1383-
(suffix != 'c' && suffix != 'd' && suffix != 's')) {
1391+
(suffix != 'c' && suffix != 'd' && suffix != 's' &&
1392+
suffix != 'w')) {
13841393
(void) fprintf(stderr, gettext("invalid dRAID "
1385-
"syntax; expected [:<number><c|d|s>] not '%s'\n"),
1386-
type);
1394+
"syntax; expected [:<number><c|d|s|w>], "
1395+
"not '%s'\n"), type);
13871396
return (EINVAL);
13881397
}
13891398

13901399
if (suffix == 'c') {
1391-
if ((uint64_t)value != children) {
1400+
if ((uint64_t)value > width ||
1401+
width % (uint64_t)value != 0) {
13921402
fprintf(stderr,
1393-
gettext("invalid number of dRAID children; "
1403+
gettext("invalid number of dRAID disks; "
1404+
"multiple of %llu required but %llu "
1405+
"provided\n"), (u_longlong_t)value,
1406+
(u_longlong_t)width);
1407+
return (EINVAL);
1408+
}
1409+
children = value;
1410+
} else if (suffix == 'w') {
1411+
if ((uint64_t)value != width) {
1412+
fprintf(stderr,
1413+
gettext("invalid number of dRAID disks; "
13941414
"%llu required but %llu provided\n"),
1395-
(u_longlong_t)value,
1396-
(u_longlong_t)children);
1415+
(u_longlong_t)value, (u_longlong_t)width);
13971416
return (EINVAL);
13981417
}
13991418
} else if (suffix == 'd') {
@@ -1405,6 +1424,42 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
14051424
}
14061425
}
14071426

1427+
if (!children && nfgroup)
1428+
children = width / nfgroup;
1429+
if (!children && nfdomain)
1430+
children = nfdomain;
1431+
if (!children)
1432+
children = width;
1433+
1434+
int fgrps = width / children;
1435+
1436+
if ((nspares % fgrps) != 0) {
1437+
fprintf(stderr, gettext("invalid number of distributed spares "
1438+
"%llu, must be multiple of failure groups %d\n"),
1439+
(u_longlong_t)nspares, fgrps);
1440+
return (EINVAL);
1441+
}
1442+
1443+
if (fgrps == 1 && (nfgroup || nfdomain)) {
1444+
fprintf(stderr, gettext("failure domains are not set "
1445+
"in dRAID vdev descriptor\n"));
1446+
return (EINVAL);
1447+
}
1448+
1449+
if (fgrps > 1 && nfgroup && fgrps != nfgroup) {
1450+
fprintf(stderr, gettext("invalid number of failure groups "
1451+
"%d, must be %d\n"), nfgroup, fgrps);
1452+
return (EINVAL);
1453+
}
1454+
1455+
if (fgrps > 1 && nfdomain && nfdomain != children) {
1456+
fprintf(stderr, gettext("invalid number of failure domains "
1457+
"%d, must be %lu\n"), nfdomain, children);
1458+
return (EINVAL);
1459+
}
1460+
1461+
nspares /= fgrps;
1462+
14081463
/*
14091464
* When a specific number of data disks is not provided limit a
14101465
* redundancy group to 8 data disks. This value was selected to
@@ -1414,8 +1469,8 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
14141469
if (children > nspares + nparity) {
14151470
ndata = MIN(children - nspares - nparity, 8);
14161471
} else {
1417-
fprintf(stderr, gettext("request number of "
1418-
"distributed spares %llu and parity level %llu\n"
1472+
fprintf(stderr, gettext("requested number of "
1473+
"distributed spares %llu and parity level %llu "
14191474
"leaves no disks available for data\n"),
14201475
(u_longlong_t)nspares, (u_longlong_t)nparity);
14211476
return (EINVAL);
@@ -1450,7 +1505,7 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
14501505
(u_longlong_t)(ndata + nparity + nspares));
14511506
}
14521507

1453-
if (children > VDEV_DRAID_MAX_CHILDREN) {
1508+
if (width > VDEV_DRAID_MAX_CHILDREN) {
14541509
fprintf(stderr, gettext("%llu disks were provided, but "
14551510
"dRAID only supports up to %u disks"),
14561511
(u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);
@@ -1467,8 +1522,9 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
14671522
/* Store the basic dRAID configuration. */
14681523
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
14691524
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);
1470-
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
1525+
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares * fgrps);
14711526
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
1527+
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NCHILDREN, children);
14721528

14731529
return (0);
14741530
}
@@ -1606,10 +1662,41 @@ construct_spec(nvlist_t *props, int argc, char **argv)
16061662
nlogs++;
16071663
}
16081664

1665+
int nfdomain = 0, nfgroup = 0;
1666+
int fdndev = 0, fgndev = 0;
1667+
int fdndev_prev = 0, fgndev_prev = 0;
1668+
16091669
for (c = 1; c < argc; c++) {
16101670
if (is_grouping(argv[c], NULL, NULL) != NULL)
16111671
break;
16121672

1673+
if (strcmp(argv[c], "fgroup") == 0 ||
1674+
strcmp(argv[c], "failure_group") == 0) {
1675+
if (fgndev_prev &&
1676+
fgndev_prev != fgndev)
1677+
break;
1678+
fgndev_prev = fgndev;
1679+
fgndev = 0;
1680+
nfgroup++;
1681+
continue;
1682+
}
1683+
1684+
if (strcmp(argv[c], "fdomain") == 0 ||
1685+
strcmp(argv[c], "failure_domain") == 0) {
1686+
if (fdndev_prev &&
1687+
fdndev_prev != fdndev)
1688+
break;
1689+
fdndev_prev = fdndev;
1690+
fdndev = 0;
1691+
nfdomain++;
1692+
continue;
1693+
}
1694+
1695+
if (nfgroup)
1696+
fgndev++;
1697+
if (nfdomain)
1698+
fdndev++;
1699+
16131700
children++;
16141701
child = realloc(child,
16151702
children * sizeof (nvlist_t *));
@@ -1647,6 +1734,81 @@ construct_spec(nvlist_t *props, int argc, char **argv)
16471734
goto spec_out;
16481735
}
16491736

1737+
if ((nfdomain || nfgroup) &&
1738+
strcmp(type, VDEV_TYPE_DRAID) != 0) {
1739+
(void) fprintf(stderr, gettext("invalid vdev "
1740+
"specification: %s is not dRAID and cannot "
1741+
"have failure domains\n"), argv[0]);
1742+
for (c = 0; c < children; c++)
1743+
nvlist_free(child[c]);
1744+
free(child);
1745+
goto spec_out;
1746+
}
1747+
1748+
if (nfgroup && nfdomain) {
1749+
(void) fprintf(stderr, gettext("invalid vdev "
1750+
"specification: %s has mixed configuration "
1751+
"of %d failure groups and %d failure "
1752+
"domains, it must have either fgroups or "
1753+
"fdomains, not both\n"), argv[0],
1754+
nfgroup, nfdomain);
1755+
for (c = 0; c < children; c++)
1756+
nvlist_free(child[c]);
1757+
free(child);
1758+
goto spec_out;
1759+
}
1760+
1761+
if (nfgroup == 1 || nfdomain == 1) {
1762+
(void) fprintf(stderr, gettext("invalid vdev "
1763+
"specification: %s has only one failure %s "
1764+
"configured, it must be more than one\n"),
1765+
argv[0], nfgroup ? "group" : "domain");
1766+
for (c = 0; c < children; c++)
1767+
nvlist_free(child[c]);
1768+
free(child);
1769+
goto spec_out;
1770+
}
1771+
1772+
if (fgndev_prev != fgndev) {
1773+
(void) fprintf(stderr, gettext("invalid vdev "
1774+
"specification: %s has different number of "
1775+
"devices in failure group %d than in "
1776+
"previous group: %d != %d\n"), argv[0],
1777+
nfgroup, fgndev, fgndev_prev);
1778+
for (c = 0; c < children; c++)
1779+
nvlist_free(child[c]);
1780+
free(child);
1781+
goto spec_out;
1782+
}
1783+
1784+
if (fdndev_prev != fdndev) {
1785+
(void) fprintf(stderr, gettext("invalid vdev "
1786+
"specification: %s has different number of "
1787+
"devices in failure domain %d than in "
1788+
"previous domain: %d != %d\n"), argv[0],
1789+
nfdomain, fdndev, fdndev_prev);
1790+
for (c = 0; c < children; c++)
1791+
nvlist_free(child[c]);
1792+
free(child);
1793+
goto spec_out;
1794+
}
1795+
1796+
if (nfdomain) {
1797+
/* Put children in the right order */
1798+
nvlist_t **ch = NULL;
1799+
ch = realloc(ch,
1800+
children * sizeof (nvlist_t *));
1801+
if (ch == NULL)
1802+
zpool_no_memory();
1803+
int dlen = children / nfdomain;
1804+
int i = 0;
1805+
for (int g = 0; g < dlen; g++)
1806+
for (int d = 0; d < nfdomain; d++)
1807+
ch[i++] = child[g + (d * dlen)];
1808+
free(child);
1809+
child = ch;
1810+
}
1811+
16501812
argc -= c;
16511813
argv += c;
16521814

@@ -1692,7 +1854,8 @@ construct_spec(nvlist_t *props, int argc, char **argv)
16921854
}
16931855
if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
16941856
if (draid_config_by_type(nv,
1695-
fulltype, children) != 0) {
1857+
fulltype, children, nfgroup,
1858+
nfdomain) != 0) {
16961859
for (c = 0; c < children; c++)
16971860
nvlist_free(child[c]);
16981861
free(child);

include/libzfs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ typedef enum {
443443
* checksum errors) has been lost.
444444
*/
445445
ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */
446+
ZPOOL_STATUS_FAULTED_FDOM_R, /* faulted fdomain with replicas */
446447
ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */
447448

448449
/*

include/sys/fs/zfs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,8 @@ typedef enum {
389389
VDEV_PROP_SIT_OUT,
390390
VDEV_PROP_AUTOSIT,
391391
VDEV_PROP_SLOW_IO_EVENTS,
392+
VDEV_PROP_FDOMAIN,
393+
VDEV_PROP_FGROUP,
392394
VDEV_NUM_PROPS
393395
} vdev_prop_t;
394396

@@ -907,6 +909,7 @@ typedef struct zpool_load_policy {
907909
#define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata"
908910
#define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares"
909911
#define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups"
912+
#define ZPOOL_CONFIG_DRAID_NCHILDREN "draid_nchildren"
910913

911914
#define VDEV_TYPE_ROOT "root"
912915
#define VDEV_TYPE_MIRROR "mirror"

include/sys/vdev_draid.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,10 @@ typedef struct vdev_draid_config {
6868
*/
6969
uint64_t vdc_ndata; /* # of data devices in group */
7070
uint64_t vdc_nparity; /* # of parity devices in group */
71-
uint64_t vdc_nspares; /* # of distributed spares */
71+
uint64_t vdc_nspares; /* # of distributed spares in slice */
7272
uint64_t vdc_children; /* # of children */
7373
uint64_t vdc_ngroups; /* # groups per slice */
74+
uint64_t vdc_width; /* # multiple of children */
7475

7576
/*
7677
* Immutable derived constants.
@@ -103,7 +104,9 @@ extern nvlist_t *vdev_draid_read_config_spare(vdev_t *);
103104
/* Functions for dRAID distributed spares. */
104105
extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t);
105106
extern vdev_t *vdev_draid_spare_get_parent(vdev_t *);
106-
extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t);
107+
extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t *,
108+
uint64_t);
109+
extern boolean_t vdev_draid_fail_domain_allowed(vdev_t *);
107110

108111
#ifdef __cplusplus
109112
}

0 commit comments

Comments
 (0)