author    Martin Matuska <mm@FreeBSD.org>    2023-11-09 10:42:33 +0000
committer Martin Matuska <mm@FreeBSD.org>    2023-11-09 12:19:17 +0000
commit    e716630d4cf89e69ec3f675ebfceee09f1a85e05 (patch)
tree      3ee825a5671f470e1481d24312b58895a12d01ac /sys/contrib/openzfs/cmd
parent    f5b3e686292b6502878c64c3c154908024e06eb6 (diff)
parent    887a3c533b94a4b70075e310f15c45b9dee19410 (diff)
zfs: merge openzfs/zfs@887a3c533
Notable upstream pull request merges:
  #15022 5caeef02f RAID-Z expansion feature
  #15457 887a3c533 Increase L2ARC write rate and headroom
  #15504 1c1be60fa Unbreak FreeBSD world build after 3bd4df384

Obtained from:  OpenZFS
OpenZFS commit: 887a3c533b94a4b70075e310f15c45b9dee19410
Diffstat (limited to 'sys/contrib/openzfs/cmd')
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c   12
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_test.c   196
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_test.h     3
-rw-r--r--  sys/contrib/openzfs/cmd/zdb/zdb.c                   5
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_main.c        148
-rw-r--r--  sys/contrib/openzfs/cmd/ztest.c                   912
6 files changed, 963 insertions, 313 deletions
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c
index 8be08558b36d..730e6e1a040b 100644
--- a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c
+++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c
@@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl)
if (rto_opts.rto_expand) {
rm_bench = vdev_raidz_map_alloc_expanded(
- zio_bench.io_abd,
- zio_bench.io_size, zio_bench.io_offset,
+ &zio_bench,
rto_opts.rto_ashift, ncols+1, ncols,
- fn+1, rto_opts.rto_expand_offset);
+ fn+1, rto_opts.rto_expand_offset,
+ 0, B_FALSE);
} else {
rm_bench = vdev_raidz_map_alloc(&zio_bench,
BENCH_ASHIFT, ncols, fn+1);
@@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl)
if (rto_opts.rto_expand) {
rm_bench = vdev_raidz_map_alloc_expanded(
- zio_bench.io_abd,
- zio_bench.io_size, zio_bench.io_offset,
+ &zio_bench,
BENCH_ASHIFT, ncols+1, ncols,
- PARITY_PQR, rto_opts.rto_expand_offset);
+ PARITY_PQR,
+ rto_opts.rto_expand_offset, 0, B_FALSE);
} else {
rm_bench = vdev_raidz_map_alloc(&zio_bench,
BENCH_ASHIFT, ncols, PARITY_PQR);
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c
index 195026d3a7ab..6a018ecf0737 100644
--- a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c
+++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c
@@ -327,14 +327,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
if (opts->rto_expand) {
opts->rm_golden =
- vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
- opts->zio_golden->io_size, opts->zio_golden->io_offset,
+ vdev_raidz_map_alloc_expanded(opts->zio_golden,
opts->rto_ashift, total_ncols+1, total_ncols,
- parity, opts->rto_expand_offset);
- rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
- zio_test->io_size, zio_test->io_offset,
+ parity, opts->rto_expand_offset, 0, B_FALSE);
+ rm_test = vdev_raidz_map_alloc_expanded(zio_test,
opts->rto_ashift, total_ncols+1, total_ncols,
- parity, opts->rto_expand_offset);
+ parity, opts->rto_expand_offset, 0, B_FALSE);
} else {
opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
opts->rto_ashift, total_ncols, parity);
@@ -361,187 +359,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
return (err);
}
-/*
- * If reflow is not in progress, reflow_offset should be UINT64_MAX.
- * For each row, if the row is entirely before reflow_offset, it will
- * come from the new location. Otherwise this row will come from the
- * old location. Therefore, rows that straddle the reflow_offset will
- * come from the old location.
- *
- * NOTE: Until raidz expansion is implemented this function is only
- * needed by raidz_test.c to the multi-row raid_map_t functionality.
- */
-raidz_map_t *
-vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
- uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
- uint64_t nparity, uint64_t reflow_offset)
-{
- /* The zio's size in units of the vdev's minimum sector size. */
- uint64_t s = size >> ashift;
- uint64_t q, r, bc, devidx, asize = 0, tot;
-
- /*
- * "Quotient": The number of data sectors for this stripe on all but
- * the "big column" child vdevs that also contain "remainder" data.
- * AKA "full rows"
- */
- q = s / (logical_cols - nparity);
-
- /*
- * "Remainder": The number of partial stripe data sectors in this I/O.
- * This will add a sector to some, but not all, child vdevs.
- */
- r = s - q * (logical_cols - nparity);
-
- /* The number of "big columns" - those which contain remainder data. */
- bc = (r == 0 ? 0 : r + nparity);
-
- /*
- * The total number of data and parity sectors associated with
- * this I/O.
- */
- tot = s + nparity * (q + (r == 0 ? 0 : 1));
-
- /* How many rows contain data (not skip) */
- uint64_t rows = howmany(tot, logical_cols);
- int cols = MIN(tot, logical_cols);
-
- raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
- KM_SLEEP);
- rm->rm_nrows = rows;
-
- for (uint64_t row = 0; row < rows; row++) {
- raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
- rr_col[cols]), KM_SLEEP);
- rm->rm_row[row] = rr;
-
- /* The starting RAIDZ (parent) vdev sector of the row. */
- uint64_t b = (offset >> ashift) + row * logical_cols;
-
- /*
- * If we are in the middle of a reflow, and any part of this
- * row has not been copied, then use the old location of
- * this row.
- */
- int row_phys_cols = physical_cols;
- if (b + (logical_cols - nparity) > reflow_offset >> ashift)
- row_phys_cols--;
-
- /* starting child of this row */
- uint64_t child_id = b % row_phys_cols;
- /* The starting byte offset on each child vdev. */
- uint64_t child_offset = (b / row_phys_cols) << ashift;
-
- /*
- * We set cols to the entire width of the block, even
- * if this row is shorter. This is needed because parity
- * generation (for Q and R) needs to know the entire width,
- * because it treats the short row as though it was
- * full-width (and the "phantom" sectors were zero-filled).
- *
- * Another approach to this would be to set cols shorter
- * (to just the number of columns that we might do i/o to)
- * and have another mechanism to tell the parity generation
- * about the "entire width". Reconstruction (at least
- * vdev_raidz_reconstruct_general()) would also need to
- * know about the "entire width".
- */
- rr->rr_cols = cols;
- rr->rr_bigcols = bc;
- rr->rr_missingdata = 0;
- rr->rr_missingparity = 0;
- rr->rr_firstdatacol = nparity;
- rr->rr_abd_empty = NULL;
- rr->rr_nempty = 0;
-
- for (int c = 0; c < rr->rr_cols; c++, child_id++) {
- if (child_id >= row_phys_cols) {
- child_id -= row_phys_cols;
- child_offset += 1ULL << ashift;
- }
- rr->rr_col[c].rc_devidx = child_id;
- rr->rr_col[c].rc_offset = child_offset;
- rr->rr_col[c].rc_orig_data = NULL;
- rr->rr_col[c].rc_error = 0;
- rr->rr_col[c].rc_tried = 0;
- rr->rr_col[c].rc_skipped = 0;
- rr->rr_col[c].rc_need_orig_restore = B_FALSE;
-
- uint64_t dc = c - rr->rr_firstdatacol;
- if (c < rr->rr_firstdatacol) {
- rr->rr_col[c].rc_size = 1ULL << ashift;
- rr->rr_col[c].rc_abd =
- abd_alloc_linear(rr->rr_col[c].rc_size,
- B_TRUE);
- } else if (row == rows - 1 && bc != 0 && c >= bc) {
- /*
- * Past the end, this for parity generation.
- */
- rr->rr_col[c].rc_size = 0;
- rr->rr_col[c].rc_abd = NULL;
- } else {
- /*
- * "data column" (col excluding parity)
- * Add an ASCII art diagram here
- */
- uint64_t off;
-
- if (c < bc || r == 0) {
- off = dc * rows + row;
- } else {
- off = r * rows +
- (dc - r) * (rows - 1) + row;
- }
- rr->rr_col[c].rc_size = 1ULL << ashift;
- rr->rr_col[c].rc_abd = abd_get_offset_struct(
- &rr->rr_col[c].rc_abdstruct,
- abd, off << ashift, 1 << ashift);
- }
-
- asize += rr->rr_col[c].rc_size;
- }
- /*
- * If all data stored spans all columns, there's a danger that
- * parity will always be on the same device and, since parity
- * isn't read during normal operation, that that device's I/O
- * bandwidth won't be used effectively. We therefore switch
- * the parity every 1MB.
- *
- * ...at least that was, ostensibly, the theory. As a practical
- * matter unless we juggle the parity between all devices
- * evenly, we won't see any benefit. Further, occasional writes
- * that aren't a multiple of the LCM of the number of children
- * and the minimum stripe width are sufficient to avoid pessimal
- * behavior. Unfortunately, this decision created an implicit
- * on-disk format requirement that we need to support for all
- * eternity, but only for single-parity RAID-Z.
- *
- * If we intend to skip a sector in the zeroth column for
- * padding we must make sure to note this swap. We will never
- * intend to skip the first column since at least one data and
- * one parity column must appear in each row.
- */
- if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
- (offset & (1ULL << 20))) {
- ASSERT(rr->rr_cols >= 2);
- ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
- devidx = rr->rr_col[0].rc_devidx;
- uint64_t o = rr->rr_col[0].rc_offset;
- rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
- rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
- rr->rr_col[1].rc_devidx = devidx;
- rr->rr_col[1].rc_offset = o;
- }
-
- }
- ASSERT3U(asize, ==, tot << ashift);
-
- /* init RAIDZ parity ops */
- rm->rm_ops = vdev_raidz_math_get_ops();
-
- return (rm);
-}
-
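A worked example may help follow the geometry math in the removed helper (illustrative numbers, not taken from the commit): with a zio of s = 10 sectors on a 5-wide single-parity raidz (logical_cols = 5, nparity = 1, so 4 data columns):

    q    = 10 / 4            = 2    full rows
    r    = 10 - 2 * 4        = 2    remainder data sectors
    bc   = r + nparity       = 3    "big columns"
    tot  = 10 + 1 * (2 + 1)  = 13   data + parity sectors
    rows = howmany(13, 5)    = 3
    cols = MIN(13, 5)        = 5

The map then has 3 rows of 5 columns; the last row carries only bc = 3 real sectors (one parity plus the two remainder data sectors), its columns at index >= bc get rc_size = 0, and asize comes out to 13 sectors, i.e. tot << ashift, as the final assertion requires.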
static raidz_map_t *
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
{
@@ -561,10 +378,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
init_zio_abd(*zio);
if (opts->rto_expand) {
- rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
- (*zio)->io_size, (*zio)->io_offset,
+ rm = vdev_raidz_map_alloc_expanded(*zio,
opts->rto_ashift, total_ncols+1, total_ncols,
- parity, opts->rto_expand_offset);
+ parity, opts->rto_expand_offset, 0, B_FALSE);
} else {
rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
total_ncols, parity);
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h
index 163929defc73..f912e281f6f3 100644
--- a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h
+++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h
@@ -119,7 +119,4 @@ void init_zio_abd(zio_t *zio);
void run_raidz_benchmark(void);
-struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
- uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
-
#endif /* RAIDZ_TEST_H */
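With the local copy and its prototype removed, raidz_test now calls the implementation shared with the kernel raidz code. Judging only from the updated call sites in this diff, the new declaration looks roughly like the sketch below; the abd/size/offset triple is replaced by the zio itself, and the names of the two new trailing parameters are guesses, not taken from this diff:

    /* Hypothetical sketch inferred from the call sites above. */
    raidz_map_t *vdev_raidz_map_alloc_expanded(zio_t *zio,
        uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
        uint64_t nparity, uint64_t reflow_offset,
        uint64_t reflow_offset_next,    /* assumed name; passed as 0 here */
        boolean_t use_scratch_area);    /* assumed name; passed as B_FALSE here */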
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c
index 3c282f3fc975..18221c4b92d2 100644
--- a/sys/contrib/openzfs/cmd/zdb/zdb.c
+++ b/sys/contrib/openzfs/cmd/zdb/zdb.c
@@ -4134,6 +4134,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
}
(void) printf("\tcheckpoint_txg = %llu\n",
(u_longlong_t)ub->ub_checkpoint_txg);
+
+ (void) printf("\traidz_reflow state=%u off=%llu\n",
+ (int)RRSS_GET_STATE(ub),
+ (u_longlong_t)RRSS_GET_OFFSET(ub));
+
(void) printf("%s", footer ? footer : "");
}
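With this change an uberblock dump (e.g. from zdb -u) gains one extra line showing the reflow scratch state and offset; the values below are illustrative only:

    	raidz_reflow state=1 off=0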
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
index 5507f9d3fd67..9dd1d2109004 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
@@ -6650,9 +6650,17 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing,
rebuild);
- if (ret == 0 && wait)
- ret = zpool_wait(zhp,
- replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER);
+ if (ret == 0 && wait) {
+ zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER;
+ char raidz_prefix[] = "raidz";
+ if (replacing) {
+ activity = ZPOOL_WAIT_REPLACE;
+ } else if (strncmp(old_disk,
+ raidz_prefix, strlen(raidz_prefix)) == 0) {
+ activity = ZPOOL_WAIT_RAIDZ_EXPAND;
+ }
+ ret = zpool_wait(zhp, activity);
+ }
nvlist_free(props);
nvlist_free(nvroot);
@@ -6678,17 +6686,21 @@ zpool_do_replace(int argc, char **argv)
}
/*
- * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device>
+ * zpool attach [-fsw] [-o property=value] <pool> <device>|<vdev> <new_device>
*
* -f Force attach, even if <new_device> appears to be in use.
* -s Use sequential instead of healing reconstruction for resilver.
* -o Set property=value.
- * -w Wait for resilvering to complete before returning
+ * -w Wait for resilvering (mirror) or expansion (raidz) to complete
+ * before returning.
*
- * Attach <new_device> to the mirror containing <device>. If <device> is not
- * part of a mirror, then <device> will be transformed into a mirror of
- * <device> and <new_device>. In either case, <new_device> will begin life
- * with a DTL of [0, now], and will immediately begin to resilver itself.
+ * Attach <new_device> to a <device> or <vdev>, where the vdev can be of type
+ * mirror or raidz. If <device> is not part of a mirror, then <device> will
+ * be transformed into a mirror of <device> and <new_device>. When a mirror
+ * is involved, <new_device> will begin life with a DTL of [0, now], and will
+ * immediately begin to resilver itself. For the raidz case, an expansion will
+ * commence and reflow the raidz data across all the disks including the
+ * <new_device>.
*/
int
zpool_do_attach(int argc, char **argv)
@@ -8195,6 +8207,97 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
}
}
+/*
+ * Print out detailed raidz expansion status.
+ */
+static void
+print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres)
+{
+ char copied_buf[7];
+
+ if (pres == NULL || pres->pres_state == DSS_NONE)
+ return;
+
+ /*
+ * Determine name of vdev.
+ */
+ nvlist_t *config = zpool_get_config(zhp, NULL);
+ nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE);
+ nvlist_t **child;
+ uint_t children;
+ verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0);
+ assert(pres->pres_expanding_vdev < children);
+
+ printf_color(ANSI_BOLD, gettext("expand: "));
+
+ time_t start = pres->pres_start_time;
+ time_t end = pres->pres_end_time;
+ char *vname =
+ zpool_vdev_name(g_zfs, zhp, child[pres->pres_expanding_vdev], 0);
+ zfs_nicenum(pres->pres_reflowed, copied_buf, sizeof (copied_buf));
+
+ /*
+ * Expansion is finished or canceled.
+ */
+ if (pres->pres_state == DSS_FINISHED) {
+ char time_buf[32];
+ secs_to_dhms(end - start, time_buf);
+
+ (void) printf(gettext("expanded %s-%u copied %s in %s, "
+ "on %s"), vname, (int)pres->pres_expanding_vdev,
+ copied_buf, time_buf, ctime((time_t *)&end));
+ } else {
+ char examined_buf[7], total_buf[7], rate_buf[7];
+ uint64_t copied, total, elapsed, secs_left;
+ double fraction_done;
+ uint_t rate;
+
+ assert(pres->pres_state == DSS_SCANNING);
+
+ /*
+ * Expansion is in progress.
+ */
+ (void) printf(gettext(
+ "expansion of %s-%u in progress since %s"),
+ vname, (int)pres->pres_expanding_vdev, ctime(&start));
+
+ copied = pres->pres_reflowed > 0 ? pres->pres_reflowed : 1;
+ total = pres->pres_to_reflow;
+ fraction_done = (double)copied / total;
+
+ /* elapsed time for this pass */
+ elapsed = time(NULL) - pres->pres_start_time;
+ elapsed = elapsed > 0 ? elapsed : 1;
+ rate = copied / elapsed;
+ rate = rate > 0 ? rate : 1;
+ secs_left = (total - copied) / rate;
+
+ zfs_nicenum(copied, examined_buf, sizeof (examined_buf));
+ zfs_nicenum(total, total_buf, sizeof (total_buf));
+ zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
+
+ /*
+ * do not print the estimated time if it is more than
+ * 30 days
+ */
+ (void) printf(gettext("\t%s / %s copied at %s/s, %.2f%% done"),
+ examined_buf, total_buf, rate_buf, 100 * fraction_done);
+ if (pres->pres_waiting_for_resilver) {
+ (void) printf(gettext(", paused for resilver or "
+ "clear\n"));
+ } else if (secs_left < (30 * 24 * 3600)) {
+ char time_buf[32];
+ secs_to_dhms(secs_left, time_buf);
+ (void) printf(gettext(", %s to go\n"), time_buf);
+ } else {
+ (void) printf(gettext(
+ ", (copy is slow, no estimated time)\n"));
+ }
+ }
+ free(vname);
+}
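To make the progress arithmetic above concrete (illustrative figures only): if 360 GiB of 1440 GiB has been reflowed after one hour, the status line works out to:

    fraction_done = 360 / 1440               = 25%
    rate          = 360 GiB / 3600 s         ~ 102 MiB/s
    secs_left     = (1440 - 360) GiB / rate  = 10800 s   (rendered by secs_to_dhms as roughly 3 hours "to go")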
static void
print_checkpoint_status(pool_checkpoint_stat_t *pcs)
{
@@ -8772,19 +8875,24 @@ status_callback(zpool_handle_t *zhp, void *data)
uint64_t nerr;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
- pool_checkpoint_stat_t *pcs = NULL;
- pool_removal_stat_t *prs = NULL;
print_scan_status(zhp, nvroot);
+ pool_removal_stat_t *prs = NULL;
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
print_removal_status(zhp, prs);
+ pool_checkpoint_stat_t *pcs = NULL;
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
print_checkpoint_status(pcs);
+ pool_raidz_expand_stat_t *pres = NULL;
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c);
+ print_raidz_expand_status(zhp, pres);
+
cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
cbp->cb_name_flags | VDEV_NAME_TYPE_ID);
if (cbp->cb_namewidth < 10)
@@ -10738,8 +10846,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
pool_checkpoint_stat_t *pcs = NULL;
pool_scan_stat_t *pss = NULL;
pool_removal_stat_t *prs = NULL;
+ pool_raidz_expand_stat_t *pres = NULL;
const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE",
- "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"};
+ "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"};
int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES];
/* Calculate the width of each column */
@@ -10798,6 +10907,13 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
vdev_activity_top_remaining(nvroot);
}
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c);
+ if (pres != NULL && pres->pres_state == DSS_SCANNING) {
+ int64_t rem = pres->pres_to_reflow - pres->pres_reflowed;
+ bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem;
+ }
+
bytes_rem[ZPOOL_WAIT_INITIALIZE] =
vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE);
bytes_rem[ZPOOL_WAIT_TRIM] =
@@ -10827,11 +10943,12 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
if (!wd->wd_enabled[i])
continue;
- if (wd->wd_exact)
+ if (wd->wd_exact) {
(void) snprintf(buf, sizeof (buf), "%" PRIi64,
bytes_rem[i]);
- else
+ } else {
zfs_nicenum(bytes_rem[i], buf, sizeof (buf));
+ }
if (wd->wd_scripted)
(void) printf(i == 0 ? "%s" : "\t%s", buf);
@@ -10937,7 +11054,8 @@ zpool_do_wait(int argc, char **argv)
for (char *tok; (tok = strsep(&optarg, ",")); ) {
static const char *const col_opts[] = {
"discard", "free", "initialize", "replace",
- "remove", "resilver", "scrub", "trim" };
+ "remove", "resilver", "scrub", "trim",
+ "raidz_expand" };
for (i = 0; i < ARRAY_SIZE(col_opts); ++i)
if (strcmp(tok, col_opts[i]) == 0) {
diff --git a/sys/contrib/openzfs/cmd/ztest.c b/sys/contrib/openzfs/cmd/ztest.c
index 8cfbdfe1c2e2..1d414a9f6fd5 100644
--- a/sys/contrib/openzfs/cmd/ztest.c
+++ b/sys/contrib/openzfs/cmd/ztest.c
@@ -151,6 +151,7 @@ typedef struct ztest_shared_hdr {
uint64_t zh_stats_count;
uint64_t zh_ds_size;
uint64_t zh_ds_count;
+ uint64_t zh_scratch_state_size;
} ztest_shared_hdr_t;
static ztest_shared_hdr_t *ztest_shared_hdr;
@@ -161,6 +162,16 @@ enum ztest_class_state {
ZTEST_VDEV_CLASS_RND
};
+/* Dedicated RAIDZ Expansion test states */
+typedef enum {
+ RAIDZ_EXPAND_NONE, /* Default is none, must opt-in */
+ RAIDZ_EXPAND_REQUESTED, /* The '-X' option was used */
+ RAIDZ_EXPAND_STARTED, /* Testing has commenced */
+ RAIDZ_EXPAND_KILLED, /* Reached the process kill */
+ RAIDZ_EXPAND_CHECKED, /* Pool scrub verification done */
+} raidz_expand_test_state_t;
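Reading this enum together with ztest_raidz_expand_run() and ztest_raidz_expand_check() further down, the dedicated test walks the states in order:

    RAIDZ_EXPAND_NONE -> REQUESTED (-X given) -> STARTED (reflow running)
        -> KILLED (process killed mid-reflow) -> CHECKED (scrub verification after reimport)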
+
+
#define ZO_GVARS_MAX_ARGLEN ((size_t)64)
#define ZO_GVARS_MAX_COUNT ((size_t)10)
@@ -174,6 +185,7 @@ typedef struct ztest_shared_opts {
size_t zo_vdev_size;
int zo_ashift;
int zo_mirrors;
+ int zo_raid_do_expand;
int zo_raid_children;
int zo_raid_parity;
char zo_raid_type[8];
@@ -188,6 +200,7 @@ typedef struct ztest_shared_opts {
uint64_t zo_time;
uint64_t zo_maxloops;
uint64_t zo_metaslab_force_ganging;
+ raidz_expand_test_state_t zo_raidz_expand_test;
int zo_mmp_test;
int zo_special_vdevs;
int zo_dump_dbgmsg;
@@ -249,6 +262,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
.zo_gvars_count = 0,
+ .zo_raidz_expand_test = RAIDZ_EXPAND_NONE,
};
extern uint64_t metaslab_force_ganging;
@@ -261,6 +275,8 @@ extern uint_t dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
+extern uint64_t raidz_expand_max_reflow_bytes;
+extern uint_t raidz_expand_pause_point;
static ztest_shared_opts_t *ztest_shared_opts;
@@ -274,6 +290,12 @@ typedef struct ztest_shared_ds {
static ztest_shared_ds_t *ztest_shared_ds;
#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])
+typedef struct ztest_scratch_state {
+ uint64_t zs_raidz_scratch_verify_pause;
+} ztest_shared_scratch_state_t;
+
+static ztest_shared_scratch_state_t *ztest_scratch_state;
+
#define BT_MAGIC 0x123456789abcdefULL
#define MAXFAULTS(zs) \
(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)
@@ -311,9 +333,9 @@ typedef struct bufwad {
* still need to map from object ID to rangelock_t.
*/
typedef enum {
- RL_READER,
- RL_WRITER,
- RL_APPEND
+ ZTRL_READER,
+ ZTRL_WRITER,
+ ZTRL_APPEND
} rl_type_t;
typedef struct rll {
@@ -408,6 +430,7 @@ ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
+ztest_func_t ztest_vdev_raidz_attach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
@@ -465,6 +488,7 @@ static ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes),
ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
@@ -745,7 +769,7 @@ static ztest_option_t option_table[] = {
DEFAULT_RAID_CHILDREN, NULL},
{ 'R', "raid-parity", "INTEGER", "Raid parity",
DEFAULT_RAID_PARITY, NULL},
- { 'K', "raid-kind", "raidz|draid|random", "Raid kind",
+ { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind",
NO_DEFAULT, "random"},
{ 'D', "draid-data", "INTEGER", "Number of draid data drives",
DEFAULT_DRAID_DATA, NULL},
@@ -781,6 +805,9 @@ static ztest_option_t option_table[] = {
NO_DEFAULT, NULL},
{ 'C', "vdev-class-state", "on|off|random", "vdev class state",
NO_DEFAULT, "random"},
+ { 'X', "raidz-expansion", NULL,
+ "Perform a dedicated raidz expansion test",
+ NO_DEFAULT, NULL},
{ 'o', "option", "\"OPTION=INTEGER\"",
"Set global variable to an unsigned 32-bit integer value",
NO_DEFAULT, NULL},
@@ -853,7 +880,7 @@ usage(boolean_t requested)
option_table[i].short_opt,
option_table[i].long_opt);
}
- (void) fprintf(fp, " %-40s%s", option,
+ (void) fprintf(fp, " %-43s%s", option,
option_table[i].comment);
if (option_table[i].long_opt_param != NULL) {
@@ -1027,6 +1054,9 @@ process_options(int argc, char **argv)
case 'V':
zo->zo_verbose++;
break;
+ case 'X':
+ zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED;
+ break;
case 'E':
zo->zo_init = 0;
break;
@@ -1078,9 +1108,28 @@ process_options(int argc, char **argv)
fini_options();
- /* When raid choice is 'random' add a draid pool 50% of the time */
+ /* Force compatible options for raidz expansion run */
+ if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) {
+ zo->zo_mmp_test = 0;
+ zo->zo_mirrors = 0;
+ zo->zo_vdevs = 1;
+ zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2;
+ zo->zo_raid_do_expand = B_FALSE;
+ raid_kind = "raidz";
+ }
+
if (strcmp(raid_kind, "random") == 0) {
- raid_kind = (ztest_random(2) == 0) ? "draid" : "raidz";
+ switch (ztest_random(3)) {
+ case 0:
+ raid_kind = "raidz";
+ break;
+ case 1:
+ raid_kind = "eraidz";
+ break;
+ case 2:
+ raid_kind = "draid";
+ break;
+ }
if (ztest_opts.zo_verbose >= 3)
(void) printf("choosing RAID type '%s'\n", raid_kind);
@@ -1119,6 +1168,18 @@ process_options(int argc, char **argv)
(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
sizeof (zo->zo_raid_type));
+ } else if (strcmp(raid_kind, "eraidz") == 0) {
+ /* using eraidz (expandable raidz) */
+ zo->zo_raid_do_expand = B_TRUE;
+
+ /* tests expect top-level to be raidz */
+ zo->zo_mirrors = 0;
+ zo->zo_vdevs = 1;
+
+ /* Make sure parity is less than data columns */
+ zo->zo_raid_parity = MIN(zo->zo_raid_parity,
+ zo->zo_raid_children - 1);
+
} else /* using raidz */ {
ASSERT0(strcmp(raid_kind, "raidz"));
@@ -1166,9 +1227,29 @@ ztest_kill(ztest_shared_t *zs)
* Before we kill ourselves, make sure that the config is updated.
* See comment above spa_write_cachefile().
*/
- mutex_enter(&spa_namespace_lock);
- spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE);
- mutex_exit(&spa_namespace_lock);
+ if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) {
+ if (mutex_tryenter(&spa_namespace_lock)) {
+ spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE,
+ B_FALSE);
+ mutex_exit(&spa_namespace_lock);
+
+ ztest_scratch_state->zs_raidz_scratch_verify_pause =
+ raidz_expand_pause_point;
+ } else {
+ /*
+ * Do not verify the scratch object if
+ * spa_namespace_lock cannot be acquired, since that
+ * could deadlock in spa_config_update().
+ */
+ raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE;
+
+ return;
+ }
+ } else {
+ mutex_enter(&spa_namespace_lock);
+ spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE);
+ mutex_exit(&spa_namespace_lock);
+ }
(void) raise(SIGKILL);
}
@@ -1615,7 +1696,7 @@ ztest_rll_lock(rll_t *rll, rl_type_t type)
{
mutex_enter(&rll->rll_lock);
- if (type == RL_READER) {
+ if (type == ZTRL_READER) {
while (rll->rll_writer != NULL)
(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
rll->rll_readers++;
@@ -2071,7 +2152,7 @@ ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
ASSERT3U(object, !=, 0);
- ztest_object_lock(zd, object, RL_WRITER);
+ ztest_object_lock(zd, object, ZTRL_WRITER);
VERIFY0(dmu_object_info(os, object, &doi));
@@ -2141,8 +2222,8 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
if (bt->bt_magic != BT_MAGIC)
bt = NULL;
- ztest_object_lock(zd, lr->lr_foid, RL_READER);
- rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+ ztest_object_lock(zd, lr->lr_foid, ZTRL_READER);
+ rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER);
VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
@@ -2245,9 +2326,9 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
- ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ ztest_object_lock(zd, lr->lr_foid, ZTRL_READER);
rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
- RL_WRITER);
+ ZTRL_WRITER);
tx = dmu_tx_create(os);
@@ -2287,7 +2368,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
- ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+ ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER);
VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
@@ -2414,7 +2495,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
ASSERT3P(lwb, !=, NULL);
ASSERT3U(size, !=, 0);
- ztest_object_lock(zd, object, RL_READER);
+ ztest_object_lock(zd, object, ZTRL_READER);
error = dmu_bonus_hold(os, object, FTAG, &db);
if (error) {
ztest_object_unlock(zd, object);
@@ -2439,7 +2520,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
if (buf != NULL) { /* immediate write */
zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd,
- object, offset, size, RL_READER);
+ object, offset, size, ZTRL_READER);
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
@@ -2455,7 +2536,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
}
zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd,
- object, offset, size, RL_READER);
+ object, offset, size, ZTRL_READER);
error = dmu_buf_hold_noread(os, object, offset, zgd, &db);
@@ -2531,7 +2612,7 @@ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
ASSERT3U(od->od_object, !=, 0);
ASSERT0(missing); /* there should be no gaps */
- ztest_object_lock(zd, od->od_object, RL_READER);
+ ztest_object_lock(zd, od->od_object, ZTRL_READER);
VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object,
FTAG, &db));
dmu_object_info_from_db(db, &doi);
@@ -2634,7 +2715,7 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
static int
ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
- void *data)
+ const void *data)
{
lr_write_t *lr;
int error;
@@ -2704,8 +2785,8 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
txg_wait_synced(dmu_objset_pool(os), 0);
- ztest_object_lock(zd, object, RL_READER);
- rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+ ztest_object_lock(zd, object, ZTRL_READER);
+ rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER);
tx = dmu_tx_create(os);
@@ -3033,13 +3114,32 @@ ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id)
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
+static int
+ztest_get_raidz_children(spa_t *spa)
+{
+ (void) spa;
+ vdev_t *raidvd;
+
+ ASSERT(MUTEX_HELD(&ztest_vdev_lock));
+
+ if (ztest_opts.zo_raid_do_expand) {
+ raidvd = ztest_spa->spa_root_vdev->vdev_child[0];
+
+ ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
+
+ return (raidvd->vdev_children);
+ }
+
+ return (ztest_opts.zo_raid_children);
+}
+
void
ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
{
(void) zd, (void) id;
spa_t *spa;
uint64_t initial_version = SPA_VERSION_INITIAL;
- uint64_t version, newversion;
+ uint64_t raidz_children, version, newversion;
nvlist_t *nvroot, *props;
char *name;
@@ -3058,8 +3158,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
*/
(void) spa_destroy(name);
+ raidz_children = ztest_get_raidz_children(ztest_spa);
+
nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
- NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1);
+ NULL, raidz_children, ztest_opts.zo_mirrors, 1);
/*
* If we're configuring a RAIDZ device then make sure that the
@@ -3125,6 +3227,7 @@ ztest_spa_checkpoint(spa_t *spa)
case ZFS_ERR_DEVRM_IN_PROGRESS:
case ZFS_ERR_DISCARDING_CHECKPOINT:
case ZFS_ERR_CHECKPOINT_EXISTS:
+ case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS:
break;
case ENOSPC:
ztest_record_enospc(FTAG);
@@ -3205,6 +3308,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
spa_t *spa = ztest_spa;
uint64_t leaves;
uint64_t guid;
+ uint64_t raidz_children;
+
nvlist_t *nvroot;
int error;
@@ -3212,8 +3317,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
return;
mutex_enter(&ztest_vdev_lock);
- leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
- ztest_opts.zo_raid_children;
+ raidz_children = ztest_get_raidz_children(spa);
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@@ -3267,7 +3372,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
*/
nvroot = make_vdev_root(NULL, NULL, NULL,
ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
- "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors,
+ "log" : NULL, raidz_children, zs->zs_mirrors,
1);
error = spa_vdev_add(spa, nvroot);
@@ -3295,6 +3400,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
spa_t *spa = ztest_spa;
uint64_t leaves;
nvlist_t *nvroot;
+ uint64_t raidz_children;
const char *class = (ztest_random(2) == 0) ?
VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP;
int error;
@@ -3322,15 +3428,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
return;
}
- leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
- ztest_opts.zo_raid_children;
+ raidz_children = ztest_get_raidz_children(spa);
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
spa_config_exit(spa, SCL_VDEV, FTAG);
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
- class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
+ class, raidz_children, zs->zs_mirrors, 1);
error = spa_vdev_add(spa, nvroot);
fnvlist_free(nvroot);
@@ -3592,6 +3698,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
uint64_t ashift = ztest_get_ashift();
uint64_t oldguid, pguid;
uint64_t oldsize, newsize;
+ uint64_t raidz_children;
char *oldpath, *newpath;
int replacing;
int oldvd_has_siblings = B_FALSE;
@@ -3608,7 +3715,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
mutex_enter(&ztest_vdev_lock);
- leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
+ raidz_children = ztest_get_raidz_children(spa);
+ leaves = MAX(zs->zs_mirrors, 1) * raidz_children;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
@@ -3624,6 +3732,15 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
}
/*
+ * RAIDZ leaf VDEV mirrors are not currently supported while a
+ * RAIDZ expansion is in progress.
+ */
+ if (ztest_opts.zo_raid_do_expand) {
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ goto out;
+ }
+
+ /*
* Decide whether to do an attach or a replace.
*/
replacing = ztest_random(2);
@@ -3647,7 +3764,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
if (zs->zs_mirrors >= 1) {
ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops);
ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors);
- oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children];
+ oldvd = oldvd->vdev_child[leaf / raidz_children];
}
/* pick a child out of the raidz group */
@@ -3656,8 +3773,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops);
else
ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops);
- ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children);
- oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children];
+ oldvd = oldvd->vdev_child[leaf % raidz_children];
}
/*
@@ -3825,6 +3941,226 @@ out:
umem_free(newpath, MAXPATHLEN);
}
+static void
+raidz_scratch_verify(void)
+{
+ spa_t *spa;
+ uint64_t write_size, logical_size, offset;
+ raidz_reflow_scratch_state_t state;
+ vdev_raidz_expand_t *vre;
+ vdev_t *raidvd;
+
+ ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE);
+
+ if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0)
+ return;
+
+ kernel_init(SPA_MODE_READ);
+
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(ztest_opts.zo_pool);
+ ASSERT(spa);
+ spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
+ mutex_exit(&spa_namespace_lock);
+
+ VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
+
+ ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX);
+
+ mutex_enter(&ztest_vdev_lock);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
+
+ vre = spa->spa_raidz_expand;
+ if (vre == NULL)
+ goto out;
+
+ raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
+ state = RRSS_GET_STATE(&spa->spa_uberblock);
+ write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift);
+ logical_size = write_size * raidvd->vdev_children;
+
+ switch (state) {
+ /*
+ * Initial state of reflow process. RAIDZ expansion was
+ * requested by user, but scratch object was not created.
+ */
+ case RRSS_SCRATCH_NOT_IN_USE:
+ ASSERT3U(offset, ==, 0);
+ break;
+
+ /*
+ * Scratch object was synced and stored in boot area.
+ */
+ case RRSS_SCRATCH_VALID:
+
+ /*
+ * Scratch object was synced back to raidz start offset,
+ * raidz is ready for sector by sector reflow process.
+ */
+ case RRSS_SCRATCH_INVALID_SYNCED:
+
+ /*
+ * Scratch object was synced back to raidz start offset
+ * on zpool importing, raidz is ready for sector by sector
+ * reflow process.
+ */
+ case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT:
+ ASSERT3U(offset, ==, logical_size);
+ break;
+
+ /*
+ * Sector by sector reflow process started.
+ */
+ case RRSS_SCRATCH_INVALID_SYNCED_REFLOW:
+ ASSERT3U(offset, >=, logical_size);
+ break;
+ }
+
+out:
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ mutex_exit(&ztest_vdev_lock);
+
+ ztest_scratch_state->zs_raidz_scratch_verify_pause = 0;
+
+ spa_close(spa, FTAG);
+ kernel_fini();
+}
+
+static void
+ztest_scratch_thread(void *arg)
+{
+ (void) arg;
+
+ /* wait up to 10 seconds */
+ for (int t = 100; t > 0; t -= 1) {
+ if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE)
+ thread_exit();
+
+ (void) poll(NULL, 0, 100);
+ }
+
+ /* killed when the scratch area progress reached a certain point */
+ ztest_kill(ztest_shared);
+}
+
+/*
+ * Verify that we can attach raidz device.
+ */
+void
+ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id)
+{
+ (void) zd, (void) id;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = ztest_spa;
+ uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift();
+ kthread_t *scratch_thread = NULL;
+ vdev_t *newvd, *pvd;
+ nvlist_t *root;
+ char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+ int error, expected_error = 0;
+
+ mutex_enter(&ztest_vdev_lock);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
+
+ /* Only allow attach when raid-kind = 'eraidz' */
+ if (!ztest_opts.zo_raid_do_expand) {
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ goto out;
+ }
+
+ if (ztest_opts.zo_mmp_test) {
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ goto out;
+ }
+
+ if (ztest_device_removal_active) {
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ goto out;
+ }
+
+ pvd = vdev_lookup_top(spa, 0);
+
+ ASSERT(pvd->vdev_ops == &vdev_raidz_ops);
+
+ /*
+ * Get size of a child of the raidz group,
+ * make sure device is a bit bigger
+ */
+ newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)];
+ newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2));
+
+ /*
+ * Get next attached leaf id
+ */
+ raidz_children = ztest_get_raidz_children(spa);
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children;
+ zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
+
+ if (spa->spa_raidz_expand)
+ expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS;
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * Path to vdev to be attached
+ */
+ (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
+ ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf);
+
+ /*
+ * Build the nvlist describing newpath.
+ */
+ root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL,
+ 0, 0, 1);
+
+ /*
+ * 50% of the time, set raidz_expand_pause_point to cause
+ * raidz_reflow_scratch_sync() to pause at a certain point and
+ * then kill the test after 10 seconds so raidz_scratch_verify()
+ * can confirm consistency when the pool is imported.
+ */
+ if (ztest_random(2) == 0 && expected_error == 0) {
+ raidz_expand_pause_point =
+ ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1;
+ scratch_thread = thread_create(NULL, 0, ztest_scratch_thread,
+ ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
+ }
+
+ error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE);
+
+ nvlist_free(root);
+
+ if (error == EOVERFLOW || error == ENXIO ||
+ error == ZFS_ERR_CHECKPOINT_EXISTS ||
+ error == ZFS_ERR_DISCARDING_CHECKPOINT)
+ expected_error = error;
+
+ if (error != 0 && error != expected_error) {
+ fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d",
+ newpath, newsize, error, expected_error);
+ }
+
+ if (raidz_expand_pause_point) {
+ if (error != 0) {
+ /*
+ * Do not verify scratch object in case of error
+ * returned by vdev attaching.
+ */
+ raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE;
+ }
+
+ VERIFY0(thread_join(scratch_thread));
+ }
+out:
+ mutex_exit(&ztest_vdev_lock);
+
+ umem_free(newpath, MAXPATHLEN);
+}
+
void
ztest_device_removal(ztest_ds_t *zd, uint64_t id)
{
@@ -4031,6 +4367,18 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
return;
}
+ /*
+ * If raidz expansion is in progress, the test can fail because the
+ * metaslab count will not increase immediately after the vdev is
+ * expanded. It only increases after the raidz expansion completes.
+ */
+ if (spa->spa_raidz_expand) {
+ spa_config_exit(spa, SCL_STATE, spa);
+ mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
+ return;
+ }
+
top = ztest_random_vdev_top(spa, B_TRUE);
tvd = spa->spa_root_vdev->vdev_child[top];
@@ -5815,7 +6163,7 @@ ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id)
dmu_object_info_t doi;
dmu_buf_t *db;
- ztest_object_lock(zd, obj, RL_READER);
+ ztest_object_lock(zd, obj, ZTRL_READER);
if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) {
ztest_object_unlock(zd, obj);
continue;
@@ -6038,6 +6386,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
uint64_t leaves;
uint64_t bad = 0x1990c0ffeedecadeull;
uint64_t top, leaf;
+ uint64_t raidz_children;
char *path0;
char *pathrand;
size_t fsize;
@@ -6048,6 +6397,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
vdev_t *vd0 = NULL;
uint64_t guid0 = 0;
boolean_t islog = B_FALSE;
+ boolean_t injected = B_FALSE;
path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
@@ -6060,15 +6410,23 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
* strategy for damaging blocks does not take in to account evacuated
* blocks which may have already been damaged.
*/
- if (ztest_device_removal_active) {
- mutex_exit(&ztest_vdev_lock);
+ if (ztest_device_removal_active)
+ goto out;
+
+ /*
+ * The fault injection strategy for damaging blocks cannot be used
+ * if raidz expansion is in progress. The leaves value
+ * (attached raidz children) is variable and strategy for damaging
+ * blocks will corrupt same data blocks on different child vdevs
+ * because of the reflow process.
+ */
+ if (spa->spa_raidz_expand != NULL)
goto out;
- }
maxfaults = MAXFAULTS(zs);
- leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
+ raidz_children = ztest_get_raidz_children(spa);
+ leaves = MAX(zs->zs_mirrors, 1) * raidz_children;
mirror_save = zs->zs_mirrors;
- mutex_exit(&ztest_vdev_lock);
ASSERT3U(leaves, >=, 1);
@@ -6209,13 +6567,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
* call vdev_[on|off]line without holding locks
* to force unpredictable failures but the side
* effects of vdev_[on|off]line prevent us from
- * doing so. We grab the ztest_vdev_lock here to
- * prevent a race between injection testing and
- * aux_vdev removal.
+ * doing so.
*/
- mutex_enter(&ztest_vdev_lock);
(void) vdev_online(spa, guid0, 0, NULL);
- mutex_exit(&ztest_vdev_lock);
}
}
@@ -6289,9 +6643,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
continue;
- mutex_enter(&ztest_vdev_lock);
if (mirror_save != zs->zs_mirrors) {
- mutex_exit(&ztest_vdev_lock);
(void) close(fd);
goto out;
}
@@ -6301,15 +6653,25 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
"can't inject bad word at 0x%"PRIx64" in %s",
offset, pathrand);
- mutex_exit(&ztest_vdev_lock);
-
if (ztest_opts.zo_verbose >= 7)
(void) printf("injected bad word into %s,"
" offset 0x%"PRIx64"\n", pathrand, offset);
+
+ injected = B_TRUE;
}
(void) close(fd);
out:
+ mutex_exit(&ztest_vdev_lock);
+
+ if (injected && ztest_opts.zo_raid_do_expand) {
+ int error = spa_scan(spa, POOL_SCAN_SCRUB);
+ if (error == 0) {
+ while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+ }
+
umem_free(path0, MAXPATHLEN);
umem_free(pathrand, MAXPATHLEN);
}
@@ -7194,6 +7556,75 @@ ztest_execute(int test, ztest_info_t *zi, uint64_t id)
(double)functime / NANOSEC, zi->zi_funcname);
}
+typedef struct ztest_raidz_expand_io {
+ uint64_t rzx_id;
+ uint64_t rzx_amount;
+ uint64_t rzx_bufsize;
+ const void *rzx_buffer;
+ uint64_t rzx_alloc_max;
+ spa_t *rzx_spa;
+} ztest_expand_io_t;
+
+#undef OD_ARRAY_SIZE
+#define OD_ARRAY_SIZE 10
+
+/*
+ * Write a request amount of data to some dataset objects.
+ * There will be ztest_opts.zo_threads count of these running in parallel.
+ */
+static __attribute__((noreturn)) void
+ztest_rzx_thread(void *arg)
+{
+ ztest_expand_io_t *info = (ztest_expand_io_t *)arg;
+ ztest_od_t *od;
+ int batchsize;
+ int od_size;
+ ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets];
+ spa_t *spa = info->rzx_spa;
+
+ od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+ od = umem_alloc(od_size, UMEM_NOFAIL);
+ batchsize = OD_ARRAY_SIZE;
+
+ /* Create objects to write to */
+ for (int b = 0; b < batchsize; b++) {
+ ztest_od_init(od + b, info->rzx_id, FTAG, b,
+ DMU_OT_UINT64_OTHER, 0, 0, 0);
+ }
+ if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) {
+ umem_free(od, od_size);
+ thread_exit();
+ }
+
+ for (uint64_t offset = 0, written = 0; written < info->rzx_amount;
+ offset += info->rzx_bufsize) {
+ /* write to 10 objects */
+ for (int i = 0; i < batchsize && written < info->rzx_amount;
+ i++) {
+ (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock);
+ ztest_write(zd, od[i].od_object, offset,
+ info->rzx_bufsize, info->rzx_buffer);
+ (void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
+ written += info->rzx_bufsize;
+ }
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ /* due to inflation, we'll typically bail here */
+ if (metaslab_class_get_alloc(spa_normal_class(spa)) >
+ info->rzx_alloc_max) {
+ break;
+ }
+ }
+
+ /* Remove a few objects to leave some holes in allocation space */
+ mutex_enter(&zd->zd_dirobj_lock);
+ (void) ztest_remove(zd, od, 2);
+ mutex_exit(&zd->zd_dirobj_lock);
+
+ umem_free(od, od_size);
+
+ thread_exit();
+}
+
static __attribute__((noreturn)) void
ztest_thread(void *arg)
{
@@ -7209,8 +7640,10 @@ ztest_thread(void *arg)
/*
* See if it's time to force a crash.
*/
- if (now > zs->zs_thread_kill)
+ if (now > zs->zs_thread_kill &&
+ raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) {
ztest_kill(zs);
+ }
/*
* If we're getting ENOSPC with some regularity, stop.
@@ -7400,9 +7833,14 @@ ztest_freeze(void)
spa_t *spa;
int numloops = 0;
+ /* freeze not supported during RAIDZ expansion */
+ if (ztest_opts.zo_raid_do_expand)
+ return;
+
if (ztest_opts.zo_verbose >= 3)
(void) printf("testing spa_freeze()...\n");
+ raidz_scratch_verify();
kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
VERIFY0(ztest_dataset_open(0));
@@ -7470,6 +7908,7 @@ ztest_freeze(void)
/*
* Open and close the pool and dataset to induce log replay.
*/
+ raidz_scratch_verify();
kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX);
@@ -7519,6 +7958,7 @@ ztest_import(ztest_shared_t *zs)
mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
+ raidz_scratch_verify();
kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
ztest_import_impl();
@@ -7543,7 +7983,291 @@ ztest_import(ztest_shared_t *zs)
}
/*
- * Kick off threads to run tests on all datasets in parallel.
+ * After the expansion was killed, check that the pool is healthy
+ */
+static void
+ztest_raidz_expand_check(spa_t *spa)
+{
+ ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED);
+ /*
+ * Set pool check done flag, main program will run a zdb check
+ * of the pool when we exit.
+ */
+ ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED;
+
+ /* Wait for reflow to finish */
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("\nwaiting for reflow to finish ...\n");
+ }
+ pool_raidz_expand_stat_t rzx_stats;
+ pool_raidz_expand_stat_t *pres = &rzx_stats;
+ do {
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ (void) poll(NULL, 0, 500); /* wait 1/2 second */
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ (void) spa_raidz_expand_get_stats(spa, pres);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ } while (pres->pres_state != DSS_FINISHED &&
+ pres->pres_reflowed < pres->pres_to_reflow);
+
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("verifying an interrupted raidz "
+ "expansion using a pool scrub ...\n");
+ }
+ /* Will fail here if there is non-recoverable corruption detected */
+ VERIFY0(ztest_scrub_impl(spa));
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("raidz expansion scrub check complete\n");
+ }
+}
+
+/*
+ * Start a raidz expansion test. We run some I/O on the pool for a while
+ * to get some data in the pool. Then we grow the raidz and
+ * kill the test at the requested offset into the reflow, verifying that
+ * doing such does not lead to pool corruption.
+ */
+static void
+ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa)
+{
+ nvlist_t *root;
+ pool_raidz_expand_stat_t rzx_stats;
+ pool_raidz_expand_stat_t *pres = &rzx_stats;
+ kthread_t **run_threads;
+ vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0];
+ int total_disks = rzvd->vdev_children;
+ int data_disks = total_disks - vdev_get_nparity(rzvd);
+ uint64_t alloc_goal;
+ uint64_t csize;
+ int error, t;
+ int threads = ztest_opts.zo_threads;
+ ztest_expand_io_t *thread_args;
+
+ ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE);
+ ASSERT3U(rzvd->vdev_ops, ==, &vdev_raidz_ops);
+ ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED;
+
+ /* Setup a 1 MiB buffer of random data */
+ uint64_t bufsize = 1024 * 1024;
+ void *buffer = umem_alloc(bufsize, UMEM_NOFAIL);
+
+ if (read(ztest_fd_rand, buffer, bufsize) != bufsize) {
+ fatal(B_TRUE, "short read from /dev/urandom");
+ }
+ /*
+ * Put some data in the pool and then attach a vdev to initiate
+ * reflow.
+ */
+ run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL);
+ thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t),
+ UMEM_NOFAIL);
+ /* Aim for roughly 25% of allocatable space up to 1GB */
+ alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks;
+ alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024);
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("adding data to pool '%s', goal %llu bytes\n",
+ ztest_opts.zo_pool, (u_longlong_t)alloc_goal);
+ }
+
+ /*
+ * Kick off all the I/O generators that run in parallel.
+ */
+ for (t = 0; t < threads; t++) {
+ if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
+ umem_free(run_threads, threads * sizeof (kthread_t *));
+ umem_free(buffer, bufsize);
+ return;
+ }
+ thread_args[t].rzx_id = t;
+ thread_args[t].rzx_amount = alloc_goal / threads;
+ thread_args[t].rzx_bufsize = bufsize;
+ thread_args[t].rzx_buffer = buffer;
+ thread_args[t].rzx_alloc_max = alloc_goal;
+ thread_args[t].rzx_spa = spa;
+ run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread,
+ &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE,
+ defclsyspri);
+ }
+
+ /*
+ * Wait for all of the writers to complete.
+ */
+ for (t = 0; t < threads; t++)
+ VERIFY0(thread_join(run_threads[t]));
+
+ /*
+ * Close all datasets. This must be done after all the threads
+ * are joined so we can be sure none of the datasets are in-use
+ * by any of the threads.
+ */
+ for (t = 0; t < ztest_opts.zo_threads; t++) {
+ if (t < ztest_opts.zo_datasets)
+ ztest_dataset_close(t);
+ }
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+ umem_free(buffer, bufsize);
+ umem_free(run_threads, threads * sizeof (kthread_t *));
+ umem_free(thread_args, threads * sizeof (ztest_expand_io_t));
+
+ /* Set our reflow target to 25%, 50% or 75% of allocated size */
+ uint_t multiple = ztest_random(3) + 1;
+ uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4;
+ raidz_expand_max_reflow_bytes = reflow_max;
+
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("running raidz expansion test, killing when "
+ "reflow reaches %llu bytes (%u/4 of allocated space)\n",
+ (u_longlong_t)reflow_max, multiple);
+ }
+
+ /* XXX - do we want some I/O load during the reflow? */
+
+ /*
+ * Use a disk size that is larger than existing ones
+ */
+ cvd = rzvd->vdev_child[0];
+ csize = vdev_get_min_asize(cvd);
+ csize += csize / 10;
+ /*
+ * Path to vdev to be attached
+ */
+ char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+ (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
+ ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children);
+ /*
+ * Build the nvlist describing newpath.
+ */
+ root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(),
+ NULL, 0, 0, 1);
+ /*
+ * Expand the raidz vdev by attaching the new disk
+ */
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("expanding raidz: %d wide to %d wide with '%s'\n",
+ (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1,
+ newpath);
+ }
+ error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE);
+ nvlist_free(root);
+ if (error != 0) {
+ fatal(0, "raidz expand: attach (%s %llu) returned %d",
+ newpath, (long long)csize, error);
+ }
+
+ /*
+ * Wait for reflow to begin
+ */
+ while (spa->spa_raidz_expand == NULL) {
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ (void) poll(NULL, 0, 100); /* wait 1/10 second */
+ }
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ (void) spa_raidz_expand_get_stats(spa, pres);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ while (pres->pres_state != DSS_SCANNING) {
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ (void) poll(NULL, 0, 100); /* wait 1/10 second */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ (void) spa_raidz_expand_get_stats(spa, pres);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ ASSERT3U(pres->pres_state, ==, DSS_SCANNING);
+ ASSERT3U(pres->pres_to_reflow, !=, 0);
+ /*
+ * Set so when we are killed we go to raidz checking rather than
+ * restarting test.
+ */
+ ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED;
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("raidz expansion reflow started, waiting for "
+ "%llu bytes to be copied\n", (u_longlong_t)reflow_max);
+ }
+
+ /*
+ * Wait for reflow maximum to be reached and then kill the test
+ */
+ while (pres->pres_reflowed < reflow_max) {
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ (void) poll(NULL, 0, 100); /* wait 1/10 second */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ (void) spa_raidz_expand_get_stats(spa, pres);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ /* Reset the reflow pause before killing */
+ raidz_expand_max_reflow_bytes = 0;
+
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("killing raidz expansion test after reflow "
+ "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed);
+ }
+
+ /*
+ * Kill ourself to simulate a panic during a reflow. Our parent will
+ * restart the test and the changed flag value will drive the test
+ * through the scrub/check code to verify the pool is not corrupted.
+ */
+ ztest_kill(zs);
+}
+
+static void
+ztest_generic_run(ztest_shared_t *zs, spa_t *spa)
+{
+ kthread_t **run_threads;
+ int t;
+
+ run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),
+ UMEM_NOFAIL);
+
+ /*
+ * Kick off all the tests that run in parallel.
+ */
+ for (t = 0; t < ztest_opts.zo_threads; t++) {
+ if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
+ umem_free(run_threads, ztest_opts.zo_threads *
+ sizeof (kthread_t *));
+ return;
+ }
+
+ run_threads[t] = thread_create(NULL, 0, ztest_thread,
+ (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE,
+ defclsyspri);
+ }
+
+ /*
+ * Wait for all of the tests to complete.
+ */
+ for (t = 0; t < ztest_opts.zo_threads; t++)
+ VERIFY0(thread_join(run_threads[t]));
+
+ /*
+ * Close all datasets. This must be done after all the threads
+ * are joined so we can be sure none of the datasets are in-use
+ * by any of the threads.
+ */
+ for (t = 0; t < ztest_opts.zo_threads; t++) {
+ if (t < ztest_opts.zo_datasets)
+ ztest_dataset_close(t);
+ }
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+ umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *));
+}
+
+/*
+ * Setup our test context and kick off threads to run tests on all datasets
+ * in parallel.
*/
static void
ztest_run(ztest_shared_t *zs)
@@ -7551,7 +8275,6 @@ ztest_run(ztest_shared_t *zs)
spa_t *spa;
objset_t *os;
kthread_t *resume_thread, *deadman_thread;
- kthread_t **run_threads;
uint64_t object;
int error;
int t, d;
@@ -7584,6 +8307,7 @@ ztest_run(ztest_shared_t *zs)
* Open our pool. It may need to be imported first depending on
* what tests were running when the previous pass was terminated.
*/
+ raidz_scratch_verify();
kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
error = spa_open(ztest_opts.zo_pool, &spa, FTAG);
if (error) {
@@ -7597,7 +8321,11 @@ ztest_run(ztest_shared_t *zs)
metaslab_preload_limit = ztest_random(20) + 1;
ztest_spa = spa;
- VERIFY0(vdev_raidz_impl_set("cycle"));
+ /*
+ * XXX - BUGBUG: raidz expansion - do not run this for the generic case for now
+ */
+ if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE)
+ VERIFY0(vdev_raidz_impl_set("cycle"));
dmu_objset_stats_t dds;
VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
@@ -7607,6 +8335,10 @@ ztest_run(ztest_shared_t *zs)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
dmu_objset_disown(os, B_TRUE, FTAG);
+ /* Give the dedicated raidz expansion test more grace time */
+ if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE)
+ zfs_deadman_synctime_ms *= 2;
+
/*
* Create a thread to periodically resume suspended I/O.
*/
@@ -7640,6 +8372,10 @@ ztest_run(ztest_shared_t *zs)
* If we got any ENOSPC errors on the previous run, destroy something.
*/
if (zs->zs_enospc_count != 0) {
+ /* Not expecting ENOSPC errors during raidz expansion tests */
+ ASSERT3U(ztest_opts.zo_raidz_expand_test, ==,
+ RAIDZ_EXPAND_NONE);
+
int d = ztest_random(ztest_opts.zo_datasets);
ztest_dataset_destroy(d);
}
@@ -7654,9 +8390,12 @@ ztest_run(ztest_shared_t *zs)
* that we always run the scrub whenever an indirect vdev exists
* because we have no way of knowing for sure if ztest_device_removal()
* fully completed its scrub before the pool was reimported.
+ *
+ * Does not apply for the RAIDZ expansion specific test runs
*/
- if (spa->spa_removing_phys.sr_state == DSS_SCANNING ||
- spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE &&
+ (spa->spa_removing_phys.sr_state == DSS_SCANNING ||
+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) {
while (spa->spa_removing_phys.sr_state == DSS_SCANNING)
txg_wait_synced(spa_get_dsl(spa), 0);
@@ -7666,9 +8405,6 @@ ztest_run(ztest_shared_t *zs)
ASSERT0(error);
}
- run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),
- UMEM_NOFAIL);
-
if (ztest_opts.zo_verbose >= 4)
(void) printf("starting main threads...\n");
@@ -7681,43 +8417,12 @@ ztest_run(ztest_shared_t *zs)
(void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb,
NULL, DS_FIND_CHILDREN);
- /*
- * Kick off all the tests that run in parallel.
- */
- for (t = 0; t < ztest_opts.zo_threads; t++) {
- if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
- umem_free(run_threads, ztest_opts.zo_threads *
- sizeof (kthread_t *));
- return;
- }
-
- run_threads[t] = thread_create(NULL, 0, ztest_thread,
- (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE,
- defclsyspri);
- }
-
- /*
- * Wait for all of the tests to complete.
- */
- for (t = 0; t < ztest_opts.zo_threads; t++)
- VERIFY0(thread_join(run_threads[t]));
-
- /*
- * Close all datasets. This must be done after all the threads
- * are joined so we can be sure none of the datasets are in-use
- * by any of the threads.
- */
- for (t = 0; t < ztest_opts.zo_threads; t++) {
- if (t < ztest_opts.zo_datasets)
- ztest_dataset_close(t);
- }
-
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
- zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
-
- umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *));
+ if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED)
+ ztest_raidz_expand_run(zs, spa);
+ else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED)
+ ztest_raidz_expand_check(spa);
+ else
+ ztest_generic_run(zs, spa);
/* Kill the resume and deadman threads */
ztest_exiting = B_TRUE;
@@ -7826,6 +8531,7 @@ ztest_init(ztest_shared_t *zs)
mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
+ raidz_scratch_verify();
kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
/*
@@ -7911,6 +8617,7 @@ shared_data_size(ztest_shared_hdr_t *hdr)
size += hdr->zh_size;
size += hdr->zh_stats_size * hdr->zh_stats_count;
size += hdr->zh_ds_size * hdr->zh_ds_count;
+ size += hdr->zh_scratch_state_size;
return (size);
}
@@ -7934,6 +8641,7 @@ setup_hdr(void)
hdr->zh_stats_count = ZTEST_FUNCS;
hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
hdr->zh_ds_count = ztest_opts.zo_datasets;
+ hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t);
size = shared_data_size(hdr);
VERIFY0(ftruncate(ztest_fd_data, size));
@@ -7968,6 +8676,8 @@ setup_data(void)
ztest_shared_callstate = (void *)&buf[offset];
offset += hdr->zh_stats_size * hdr->zh_stats_count;
ztest_shared_ds = (void *)&buf[offset];
+ offset += hdr->zh_ds_size * hdr->zh_ds_count;
+ ztest_scratch_state = (void *)&buf[offset];
}
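For orientation (my reading of shared_data_size() and setup_data(), not spelled out in the diff), the mmap'ed shared file consists of consecutive regions, with the new scratch state appended at the end:

    [ header | shared state | callstate * ZTEST_FUNCS | ds * datasets | scratch state ]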
static boolean_t
@@ -8188,13 +8898,14 @@ main(int argc, char **argv)
hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);
if (ztest_opts.zo_verbose >= 1) {
- (void) printf("%"PRIu64" vdevs, %d datasets, %d threads,"
- "%d %s disks, %"PRIu64" seconds...\n\n",
+ (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, "
+ "%d %s disks, parity %d, %"PRIu64" seconds...\n\n",
ztest_opts.zo_vdevs,
ztest_opts.zo_datasets,
ztest_opts.zo_threads,
ztest_opts.zo_raid_children,
ztest_opts.zo_raid_type,
+ ztest_opts.zo_raid_parity,
ztest_opts.zo_time);
}
@@ -8307,6 +9018,9 @@ main(int argc, char **argv)
if (!ztest_opts.zo_mmp_test)
ztest_run_zdb(zs->zs_guid);
+ if (ztest_shared_opts->zo_raidz_expand_test ==
+ RAIDZ_EXPAND_CHECKED)
+ break; /* raidz expand test complete */
}
if (ztest_opts.zo_verbose >= 1) {