author     Martin Matuska <mm@FreeBSD.org>          2023-11-09 10:42:33 +0000
committer  Martin Matuska <mm@FreeBSD.org>          2023-11-09 12:19:17 +0000
commit     e716630d4cf89e69ec3f675ebfceee09f1a85e05 (patch)
tree       3ee825a5671f470e1481d24312b58895a12d01ac /sys/contrib/openzfs/cmd
parent     f5b3e686292b6502878c64c3c154908024e06eb6 (diff)
parent     887a3c533b94a4b70075e310f15c45b9dee19410 (diff)
zfs: merge openzfs/zfs@887a3c533
Notable upstream pull request merges:
#15022 5caeef02f RAID-Z expansion feature
#15457 887a3c533 Increase L2ARC write rate and headroom
#15504 1c1be60fa Unbreak FreeBSD world build after 3bd4df384
Obtained from: OpenZFS
OpenZFS commit: 887a3c533b94a4b70075e310f15c45b9dee19410
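Note on the RAID-Z expansion merge (#15022): the zpool_main.c hunk below changes "zpool attach -w" to wait on the new ZPOOL_WAIT_RAIDZ_EXPAND activity when the attach target is a raidz vdev, instead of waiting for a resilver. For orientation only, a minimal libzfs sketch of waiting on that activity is shown here; the helper name and error handling are illustrative assumptions, not part of this commit.

/*
 * Minimal sketch (assumption, not from this commit): wait for an
 * in-progress raidz expansion on a pool via libzfs, mirroring what
 * "zpool attach -w" now does when the attach target is a raidz vdev.
 */
#include <libzfs.h>

static int
wait_for_raidz_expand(const char *poolname)
{
	libzfs_handle_t *hdl = libzfs_init();
	if (hdl == NULL)
		return (1);

	zpool_handle_t *zhp = zpool_open(hdl, poolname);
	if (zhp == NULL) {
		libzfs_fini(hdl);
		return (1);
	}

	/* ZPOOL_WAIT_RAIDZ_EXPAND is the wait activity added by this merge. */
	int err = zpool_wait(zhp, ZPOOL_WAIT_RAIDZ_EXPAND);

	zpool_close(zhp);
	libzfs_fini(hdl);
	return (err);
}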
Diffstat (limited to 'sys/contrib/openzfs/cmd')
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c  |  12
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_test.c   | 196
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_test.h   |   3
-rw-r--r--  sys/contrib/openzfs/cmd/zdb/zdb.c                 |   5
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_main.c        | 148
-rw-r--r--  sys/contrib/openzfs/cmd/ztest.c                   | 912
6 files changed, 963 insertions, 313 deletions
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c index 8be08558b36d..730e6e1a040b 100644 --- a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c @@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, rto_opts.rto_ashift, ncols+1, ncols, - fn+1, rto_opts.rto_expand_offset); + fn+1, rto_opts.rto_expand_offset, + 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, fn+1); @@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, BENCH_ASHIFT, ncols+1, ncols, - PARITY_PQR, rto_opts.rto_expand_offset); + PARITY_PQR, + rto_opts.rto_expand_offset, 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, PARITY_PQR); diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c index 195026d3a7ab..6a018ecf0737 100644 --- a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c @@ -327,14 +327,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) if (opts->rto_expand) { opts->rm_golden = - vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, - opts->zio_golden->io_size, opts->zio_golden->io_offset, + vdev_raidz_map_alloc_expanded(opts->zio_golden, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); - rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, - zio_test->io_size, zio_test->io_offset, + parity, opts->rto_expand_offset, 0, B_FALSE); + rm_test = vdev_raidz_map_alloc_expanded(zio_test, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, opts->rto_ashift, total_ncols, parity); @@ -361,187 +359,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) return (err); } -/* - * If reflow is not in progress, reflow_offset should be UINT64_MAX. - * For each row, if the row is entirely before reflow_offset, it will - * come from the new location. Otherwise this row will come from the - * old location. Therefore, rows that straddle the reflow_offset will - * come from the old location. - * - * NOTE: Until raidz expansion is implemented this function is only - * needed by raidz_test.c to the multi-row raid_map_t functionality. - */ -raidz_map_t * -vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, - uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, - uint64_t nparity, uint64_t reflow_offset) -{ - /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = size >> ashift; - uint64_t q, r, bc, devidx, asize = 0, tot; - - /* - * "Quotient": The number of data sectors for this stripe on all but - * the "big column" child vdevs that also contain "remainder" data. - * AKA "full rows" - */ - q = s / (logical_cols - nparity); - - /* - * "Remainder": The number of partial stripe data sectors in this I/O. - * This will add a sector to some, but not all, child vdevs. - */ - r = s - q * (logical_cols - nparity); - - /* The number of "big columns" - those which contain remainder data. 
*/ - bc = (r == 0 ? 0 : r + nparity); - - /* - * The total number of data and parity sectors associated with - * this I/O. - */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); - - /* How many rows contain data (not skip) */ - uint64_t rows = howmany(tot, logical_cols); - int cols = MIN(tot, logical_cols); - - raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), - KM_SLEEP); - rm->rm_nrows = rows; - - for (uint64_t row = 0; row < rows; row++) { - raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, - rr_col[cols]), KM_SLEEP); - rm->rm_row[row] = rr; - - /* The starting RAIDZ (parent) vdev sector of the row. */ - uint64_t b = (offset >> ashift) + row * logical_cols; - - /* - * If we are in the middle of a reflow, and any part of this - * row has not been copied, then use the old location of - * this row. - */ - int row_phys_cols = physical_cols; - if (b + (logical_cols - nparity) > reflow_offset >> ashift) - row_phys_cols--; - - /* starting child of this row */ - uint64_t child_id = b % row_phys_cols; - /* The starting byte offset on each child vdev. */ - uint64_t child_offset = (b / row_phys_cols) << ashift; - - /* - * We set cols to the entire width of the block, even - * if this row is shorter. This is needed because parity - * generation (for Q and R) needs to know the entire width, - * because it treats the short row as though it was - * full-width (and the "phantom" sectors were zero-filled). - * - * Another approach to this would be to set cols shorter - * (to just the number of columns that we might do i/o to) - * and have another mechanism to tell the parity generation - * about the "entire width". Reconstruction (at least - * vdev_raidz_reconstruct_general()) would also need to - * know about the "entire width". - */ - rr->rr_cols = cols; - rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; - rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; - - for (int c = 0; c < rr->rr_cols; c++, child_id++) { - if (child_id >= row_phys_cols) { - child_id -= row_phys_cols; - child_offset += 1ULL << ashift; - } - rr->rr_col[c].rc_devidx = child_id; - rr->rr_col[c].rc_offset = child_offset; - rr->rr_col[c].rc_orig_data = NULL; - rr->rr_col[c].rc_error = 0; - rr->rr_col[c].rc_tried = 0; - rr->rr_col[c].rc_skipped = 0; - rr->rr_col[c].rc_need_orig_restore = B_FALSE; - - uint64_t dc = c - rr->rr_firstdatacol; - if (c < rr->rr_firstdatacol) { - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = - abd_alloc_linear(rr->rr_col[c].rc_size, - B_TRUE); - } else if (row == rows - 1 && bc != 0 && c >= bc) { - /* - * Past the end, this for parity generation. - */ - rr->rr_col[c].rc_size = 0; - rr->rr_col[c].rc_abd = NULL; - } else { - /* - * "data column" (col excluding parity) - * Add an ASCII art diagram here - */ - uint64_t off; - - if (c < bc || r == 0) { - off = dc * rows + row; - } else { - off = r * rows + - (dc - r) * (rows - 1) + row; - } - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = abd_get_offset_struct( - &rr->rr_col[c].rc_abdstruct, - abd, off << ashift, 1 << ashift); - } - - asize += rr->rr_col[c].rc_size; - } - /* - * If all data stored spans all columns, there's a danger that - * parity will always be on the same device and, since parity - * isn't read during normal operation, that that device's I/O - * bandwidth won't be used effectively. We therefore switch - * the parity every 1MB. - * - * ...at least that was, ostensibly, the theory. 
As a practical - * matter unless we juggle the parity between all devices - * evenly, we won't see any benefit. Further, occasional writes - * that aren't a multiple of the LCM of the number of children - * and the minimum stripe width are sufficient to avoid pessimal - * behavior. Unfortunately, this decision created an implicit - * on-disk format requirement that we need to support for all - * eternity, but only for single-parity RAID-Z. - * - * If we intend to skip a sector in the zeroth column for - * padding we must make sure to note this swap. We will never - * intend to skip the first column since at least one data and - * one parity column must appear in each row. - */ - if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && - (offset & (1ULL << 20))) { - ASSERT(rr->rr_cols >= 2); - ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - devidx = rr->rr_col[0].rc_devidx; - uint64_t o = rr->rr_col[0].rc_offset; - rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; - rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; - rr->rr_col[1].rc_devidx = devidx; - rr->rr_col[1].rc_offset = o; - } - - } - ASSERT3U(asize, ==, tot << ashift); - - /* init RAIDZ parity ops */ - rm->rm_ops = vdev_raidz_math_get_ops(); - - return (rm); -} - static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { @@ -561,10 +378,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) init_zio_abd(*zio); if (opts->rto_expand) { - rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, - (*zio)->io_size, (*zio)->io_offset, + rm = vdev_raidz_map_alloc_expanded(*zio, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h index 163929defc73..f912e281f6f3 100644 --- a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h @@ -119,7 +119,4 @@ void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); -struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); - #endif /* RAIDZ_TEST_H */ diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 3c282f3fc975..18221c4b92d2 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -4134,6 +4134,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); + + (void) printf("\traidz_reflow state=%u off=%llu\n", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + (void) printf("%s", footer ? footer : ""); } diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index 5507f9d3fd67..9dd1d2109004 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -6650,9 +6650,17 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); - if (ret == 0 && wait) - ret = zpool_wait(zhp, - replacing ? 
ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); + if (ret == 0 && wait) { + zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER; + char raidz_prefix[] = "raidz"; + if (replacing) { + activity = ZPOOL_WAIT_REPLACE; + } else if (strncmp(old_disk, + raidz_prefix, strlen(raidz_prefix)) == 0) { + activity = ZPOOL_WAIT_RAIDZ_EXPAND; + } + ret = zpool_wait(zhp, activity); + } nvlist_free(props); nvlist_free(nvroot); @@ -6678,17 +6686,21 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device> + * zpool attach [-fsw] [-o property=value] <pool> <device>|<vdev> <new_device> * * -f Force attach, even if <new_device> appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. - * -w Wait for resilvering to complete before returning + * -w Wait for resilvering (mirror) or expansion (raidz) to complete + * before returning. * - * Attach <new_device> to the mirror containing <device>. If <device> is not - * part of a mirror, then <device> will be transformed into a mirror of - * <device> and <new_device>. In either case, <new_device> will begin life - * with a DTL of [0, now], and will immediately begin to resilver itself. + * Attach <new_device> to a <device> or <vdev>, where the vdev can be of type + * mirror or raidz. If <device> is not part of a mirror, then <device> will + * be transformed into a mirror of <device> and <new_device>. When a mirror + * is involved, <new_device> will begin life with a DTL of [0, now], and will + * immediately begin to resilver itself. For the raidz case, a expansion will + * commence and reflow the raidz data across all the disks including the + * <new_device>. */ int zpool_do_attach(int argc, char **argv) @@ -8195,6 +8207,97 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) } } +/* + * Print out detailed raidz expansion status. + */ +static void +print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres) +{ + char copied_buf[7]; + + if (pres == NULL || pres->pres_state == DSS_NONE) + return; + + /* + * Determine name of vdev. + */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + nvlist_t **child; + uint_t children; + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + assert(pres->pres_expanding_vdev < children); + + printf_color(ANSI_BOLD, gettext("expand: ")); + + time_t start = pres->pres_start_time; + time_t end = pres->pres_end_time; + char *vname = + zpool_vdev_name(g_zfs, zhp, child[pres->pres_expanding_vdev], 0); + zfs_nicenum(pres->pres_reflowed, copied_buf, sizeof (copied_buf)); + + /* + * Expansion is finished or canceled. + */ + if (pres->pres_state == DSS_FINISHED) { + char time_buf[32]; + secs_to_dhms(end - start, time_buf); + + (void) printf(gettext("expanded %s-%u copied %s in %s, " + "on %s"), vname, (int)pres->pres_expanding_vdev, + copied_buf, time_buf, ctime((time_t *)&end)); + } else { + char examined_buf[7], total_buf[7], rate_buf[7]; + uint64_t copied, total, elapsed, secs_left; + double fraction_done; + uint_t rate; + + assert(pres->pres_state == DSS_SCANNING); + + /* + * Expansion is in progress. + */ + (void) printf(gettext( + "expansion of %s-%u in progress since %s"), + vname, (int)pres->pres_expanding_vdev, ctime(&start)); + + copied = pres->pres_reflowed > 0 ? 
pres->pres_reflowed : 1; + total = pres->pres_to_reflow; + fraction_done = (double)copied / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - pres->pres_start_time; + elapsed = elapsed > 0 ? elapsed : 1; + rate = copied / elapsed; + rate = rate > 0 ? rate : 1; + secs_left = (total - copied) / rate; + + zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than + * 30 days + */ + (void) printf(gettext("\t%s / %s copied at %s/s, %.2f%% done"), + examined_buf, total_buf, rate_buf, 100 * fraction_done); + if (pres->pres_waiting_for_resilver) { + (void) printf(gettext(", paused for resilver or " + "clear\n")); + } else if (secs_left < (30 * 24 * 3600)) { + char time_buf[32]; + secs_to_dhms(secs_left, time_buf); + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext( + ", (copy is slow, no estimated time)\n")); + } + } + free(vname); +} static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { @@ -8772,19 +8875,24 @@ status_callback(zpool_handle_t *zhp, void *data) uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - pool_checkpoint_stat_t *pcs = NULL; - pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); + pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); + pool_checkpoint_stat_t *pcs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); + pool_raidz_expand_stat_t *pres = NULL; + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + print_raidz_expand_status(zhp, pres); + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) @@ -10738,8 +10846,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; + pool_raidz_expand_stat_t *pres = NULL; const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE", - "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"}; + "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ @@ -10798,6 +10907,13 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) vdev_activity_top_remaining(nvroot); } + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + if (pres != NULL && pres->pres_state == DSS_SCANNING) { + int64_t rem = pres->pres_to_reflow - pres->pres_reflowed; + bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem; + } + bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = @@ -10827,11 +10943,12 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) if (!wd->wd_enabled[i]) continue; - if (wd->wd_exact) + if (wd->wd_exact) { (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); - else + } else { zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); + } if (wd->wd_scripted) (void) printf(i == 0 ? 
"%s" : "\t%s", buf); @@ -10937,7 +11054,8 @@ zpool_do_wait(int argc, char **argv) for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "discard", "free", "initialize", "replace", - "remove", "resilver", "scrub", "trim" }; + "remove", "resilver", "scrub", "trim", + "raidz_expand" }; for (i = 0; i < ARRAY_SIZE(col_opts); ++i) if (strcmp(tok, col_opts[i]) == 0) { diff --git a/sys/contrib/openzfs/cmd/ztest.c b/sys/contrib/openzfs/cmd/ztest.c index 8cfbdfe1c2e2..1d414a9f6fd5 100644 --- a/sys/contrib/openzfs/cmd/ztest.c +++ b/sys/contrib/openzfs/cmd/ztest.c @@ -151,6 +151,7 @@ typedef struct ztest_shared_hdr { uint64_t zh_stats_count; uint64_t zh_ds_size; uint64_t zh_ds_count; + uint64_t zh_scratch_state_size; } ztest_shared_hdr_t; static ztest_shared_hdr_t *ztest_shared_hdr; @@ -161,6 +162,16 @@ enum ztest_class_state { ZTEST_VDEV_CLASS_RND }; +/* Dedicated RAIDZ Expansion test states */ +typedef enum { + RAIDZ_EXPAND_NONE, /* Default is none, must opt-in */ + RAIDZ_EXPAND_REQUESTED, /* The '-X' option was used */ + RAIDZ_EXPAND_STARTED, /* Testing has commenced */ + RAIDZ_EXPAND_KILLED, /* Reached the proccess kill */ + RAIDZ_EXPAND_CHECKED, /* Pool scrub verification done */ +} raidz_expand_test_state_t; + + #define ZO_GVARS_MAX_ARGLEN ((size_t)64) #define ZO_GVARS_MAX_COUNT ((size_t)10) @@ -174,6 +185,7 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; + int zo_raid_do_expand; int zo_raid_children; int zo_raid_parity; char zo_raid_type[8]; @@ -188,6 +200,7 @@ typedef struct ztest_shared_opts { uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; + raidz_expand_test_state_t zo_raidz_expand_test; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; @@ -249,6 +262,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, .zo_gvars_count = 0, + .zo_raidz_expand_test = RAIDZ_EXPAND_NONE, }; extern uint64_t metaslab_force_ganging; @@ -261,6 +275,8 @@ extern uint_t dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; +extern uint64_t raidz_expand_max_reflow_bytes; +extern uint_t raidz_expand_pause_point; static ztest_shared_opts_t *ztest_shared_opts; @@ -274,6 +290,12 @@ typedef struct ztest_shared_ds { static ztest_shared_ds_t *ztest_shared_ds; #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) +typedef struct ztest_scratch_state { + uint64_t zs_raidz_scratch_verify_pause; +} ztest_shared_scratch_state_t; + +static ztest_shared_scratch_state_t *ztest_scratch_state; + #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) @@ -311,9 +333,9 @@ typedef struct bufwad { * still need to map from object ID to rangelock_t. 
*/ typedef enum { - RL_READER, - RL_WRITER, - RL_APPEND + ZTRL_READER, + ZTRL_WRITER, + ZTRL_APPEND } rl_type_t; typedef struct rll { @@ -408,6 +430,7 @@ ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; +ztest_func_t ztest_vdev_raidz_attach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_class_add; @@ -465,6 +488,7 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), + ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), @@ -745,7 +769,7 @@ static ztest_option_t option_table[] = { DEFAULT_RAID_CHILDREN, NULL}, { 'R', "raid-parity", "INTEGER", "Raid parity", DEFAULT_RAID_PARITY, NULL}, - { 'K', "raid-kind", "raidz|draid|random", "Raid kind", + { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", NO_DEFAULT, "random"}, { 'D', "draid-data", "INTEGER", "Number of draid data drives", DEFAULT_DRAID_DATA, NULL}, @@ -781,6 +805,9 @@ static ztest_option_t option_table[] = { NO_DEFAULT, NULL}, { 'C', "vdev-class-state", "on|off|random", "vdev class state", NO_DEFAULT, "random"}, + { 'X', "raidz-expansion", NULL, + "Perform a dedicated raidz expansion test", + NO_DEFAULT, NULL}, { 'o', "option", "\"OPTION=INTEGER\"", "Set global variable to an unsigned 32-bit integer value", NO_DEFAULT, NULL}, @@ -853,7 +880,7 @@ usage(boolean_t requested) option_table[i].short_opt, option_table[i].long_opt); } - (void) fprintf(fp, " %-40s%s", option, + (void) fprintf(fp, " %-43s%s", option, option_table[i].comment); if (option_table[i].long_opt_param != NULL) { @@ -1027,6 +1054,9 @@ process_options(int argc, char **argv) case 'V': zo->zo_verbose++; break; + case 'X': + zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED; + break; case 'E': zo->zo_init = 0; break; @@ -1078,9 +1108,28 @@ process_options(int argc, char **argv) fini_options(); - /* When raid choice is 'random' add a draid pool 50% of the time */ + /* Force compatible options for raidz expansion run */ + if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) { + zo->zo_mmp_test = 0; + zo->zo_mirrors = 0; + zo->zo_vdevs = 1; + zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2; + zo->zo_raid_do_expand = B_FALSE; + raid_kind = "raidz"; + } + if (strcmp(raid_kind, "random") == 0) { - raid_kind = (ztest_random(2) == 0) ? 
"draid" : "raidz"; + switch (ztest_random(3)) { + case 0: + raid_kind = "raidz"; + break; + case 1: + raid_kind = "eraidz"; + break; + case 2: + raid_kind = "draid"; + break; + } if (ztest_opts.zo_verbose >= 3) (void) printf("choosing RAID type '%s'\n", raid_kind); @@ -1119,6 +1168,18 @@ process_options(int argc, char **argv) (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, sizeof (zo->zo_raid_type)); + } else if (strcmp(raid_kind, "eraidz") == 0) { + /* using eraidz (expandable raidz) */ + zo->zo_raid_do_expand = B_TRUE; + + /* tests expect top-level to be raidz */ + zo->zo_mirrors = 0; + zo->zo_vdevs = 1; + + /* Make sure parity is less than data columns */ + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } else /* using raidz */ { ASSERT0(strcmp(raid_kind, "raidz")); @@ -1166,9 +1227,29 @@ ztest_kill(ztest_shared_t *zs) * Before we kill ourselves, make sure that the config is updated. * See comment above spa_write_cachefile(). */ - mutex_enter(&spa_namespace_lock); - spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); - mutex_exit(&spa_namespace_lock); + if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { + if (mutex_tryenter(&spa_namespace_lock)) { + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, + B_FALSE); + mutex_exit(&spa_namespace_lock); + + ztest_scratch_state->zs_raidz_scratch_verify_pause = + raidz_expand_pause_point; + } else { + /* + * Do not verify scratch object in case if + * spa_namespace_lock cannot be acquired, + * it can cause deadlock in spa_config_update(). + */ + raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; + + return; + } + } else { + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); + mutex_exit(&spa_namespace_lock); + } (void) raise(SIGKILL); } @@ -1615,7 +1696,7 @@ ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); - if (type == RL_READER) { + if (type == ZTRL_READER) { while (rll->rll_writer != NULL) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; @@ -2071,7 +2152,7 @@ ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT3U(object, !=, 0); - ztest_object_lock(zd, object, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_WRITER); VERIFY0(dmu_object_info(os, object, &doi)); @@ -2141,8 +2222,8 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) if (bt->bt_magic != BT_MAGIC) bt = NULL; - ztest_object_lock(zd, lr->lr_foid, RL_READER); - rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2245,9 +2326,9 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_READER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, - RL_WRITER); + ZTRL_WRITER); tx = dmu_tx_create(os); @@ -2287,7 +2368,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2414,7 +2495,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, ASSERT3P(lwb, !=, NULL); 
ASSERT3U(size, !=, 0); - ztest_object_lock(zd, object, RL_READER); + ztest_object_lock(zd, object, ZTRL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); @@ -2439,7 +2520,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, if (buf != NULL) { /* immediate write */ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, ZTRL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -2455,7 +2536,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, } zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, ZTRL_READER); error = dmu_buf_hold_noread(os, object, offset, zgd, &db); @@ -2531,7 +2612,7 @@ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) ASSERT3U(od->od_object, !=, 0); ASSERT0(missing); /* there should be no gaps */ - ztest_object_lock(zd, od->od_object, RL_READER); + ztest_object_lock(zd, od->od_object, ZTRL_READER); VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); @@ -2634,7 +2715,7 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) static int ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, - void *data) + const void *data) { lr_write_t *lr; int error; @@ -2704,8 +2785,8 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) txg_wait_synced(dmu_objset_pool(os), 0); - ztest_object_lock(zd, object, RL_READER); - rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_READER); + rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); tx = dmu_tx_create(os); @@ -3033,13 +3114,32 @@ ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) spa_config_exit(spa, SCL_CONFIG, FTAG); } +static int +ztest_get_raidz_children(spa_t *spa) +{ + (void) spa; + vdev_t *raidvd; + + ASSERT(MUTEX_HELD(&ztest_vdev_lock)); + + if (ztest_opts.zo_raid_do_expand) { + raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; + + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + return (raidvd->vdev_children); + } + + return (ztest_opts.zo_raid_children); +} + void ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; - uint64_t version, newversion; + uint64_t raidz_children, version, newversion; nvlist_t *nvroot, *props; char *name; @@ -3058,8 +3158,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) */ (void) spa_destroy(name); + raidz_children = ztest_get_raidz_children(ztest_spa); + nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); + NULL, raidz_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the @@ -3125,6 +3227,7 @@ ztest_spa_checkpoint(spa_t *spa) case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: + case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: break; case ENOSPC: ztest_record_enospc(FTAG); @@ -3205,6 +3308,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; + uint64_t raidz_children; + nvlist_t *nvroot; int error; @@ -3212,8 +3317,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) return; mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - 
ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -3267,7 +3372,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? - "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, + "log" : NULL, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); @@ -3295,6 +3400,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; nvlist_t *nvroot; + uint64_t raidz_children; const char *class = (ztest_random(2) == 0) ? VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; @@ -3322,15 +3428,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) return; } - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); + class, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); @@ -3592,6 +3698,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; + uint64_t raidz_children; char *oldpath, *newpath; int replacing; int oldvd_has_siblings = B_FALSE; @@ -3608,7 +3715,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3624,6 +3732,15 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) } /* + * RAIDZ leaf VDEV mirrors are not currently supported while a + * RAIDZ expansion is in progress. + */ + if (ztest_opts.zo_raid_do_expand) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + /* * Decide whether to do an attach or a replace. 
*/ replacing = ztest_random(2); @@ -3647,7 +3764,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (zs->zs_mirrors >= 1) { ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf / raidz_children]; } /* pick a child out of the raidz group */ @@ -3656,8 +3773,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); - ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf % raidz_children]; } /* @@ -3825,6 +3941,226 @@ out: umem_free(newpath, MAXPATHLEN); } +static void +raidz_scratch_verify(void) +{ + spa_t *spa; + uint64_t write_size, logical_size, offset; + raidz_reflow_scratch_state_t state; + vdev_raidz_expand_t *vre; + vdev_t *raidvd; + + ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); + + if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) + return; + + kernel_init(SPA_MODE_READ); + + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(ztest_opts.zo_pool); + ASSERT(spa); + spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; + mutex_exit(&spa_namespace_lock); + + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + + ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + + vre = spa->spa_raidz_expand; + if (vre == NULL) + goto out; + + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + state = RRSS_GET_STATE(&spa->spa_uberblock); + write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift); + logical_size = write_size * raidvd->vdev_children; + + switch (state) { + /* + * Initial state of reflow process. RAIDZ expansion was + * requested by user, but scratch object was not created. + */ + case RRSS_SCRATCH_NOT_IN_USE: + ASSERT3U(offset, ==, 0); + break; + + /* + * Scratch object was synced and stored in boot area. + */ + case RRSS_SCRATCH_VALID: + + /* + * Scratch object was synced back to raidz start offset, + * raidz is ready for sector by sector reflow process. + */ + case RRSS_SCRATCH_INVALID_SYNCED: + + /* + * Scratch object was synced back to raidz start offset + * on zpool importing, raidz is ready for sector by sector + * reflow process. + */ + case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: + ASSERT3U(offset, ==, logical_size); + break; + + /* + * Sector by sector reflow process started. + */ + case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: + ASSERT3U(offset, >=, logical_size); + break; + } + +out: + spa_config_exit(spa, SCL_ALL, FTAG); + + mutex_exit(&ztest_vdev_lock); + + ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; + + spa_close(spa, FTAG); + kernel_fini(); +} + +static void +ztest_scratch_thread(void *arg) +{ + (void) arg; + + /* wait up to 10 seconds */ + for (int t = 100; t > 0; t -= 1) { + if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) + thread_exit(); + + (void) poll(NULL, 0, 100); + } + + /* killed when the scratch area progress reached a certain point */ + ztest_kill(ztest_shared); +} + +/* + * Verify that we can attach raidz device. 
+ */ +void +ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); + kthread_t *scratch_thread = NULL; + vdev_t *newvd, *pvd; + nvlist_t *root; + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + int error, expected_error = 0; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + + /* Only allow attach when raid-kind = 'eraidz' */ + if (!ztest_opts.zo_raid_do_expand) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (ztest_opts.zo_mmp_test) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (ztest_device_removal_active) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + pvd = vdev_lookup_top(spa, 0); + + ASSERT(pvd->vdev_ops == &vdev_raidz_ops); + + /* + * Get size of a child of the raidz group, + * make sure device is a bit bigger + */ + newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; + newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); + + /* + * Get next attached leaf id + */ + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; + zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; + + if (spa->spa_raidz_expand) + expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; + + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Path to vdev to be attached + */ + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); + + /* + * Build the nvlist describing newpath. + */ + root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, + 0, 0, 1); + + /* + * 50% of the time, set raidz_expand_pause_point to cause + * raidz_reflow_scratch_sync() to pause at a certain point and + * then kill the test after 10 seconds so raidz_scratch_verify() + * can confirm consistency when the pool is imported. + */ + if (ztest_random(2) == 0 && expected_error == 0) { + raidz_expand_pause_point = + ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; + scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, + ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + } + + error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); + + nvlist_free(root); + + if (error == EOVERFLOW || error == ENXIO || + error == ZFS_ERR_CHECKPOINT_EXISTS || + error == ZFS_ERR_DISCARDING_CHECKPOINT) + expected_error = error; + + if (error != 0 && error != expected_error) { + fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", + newpath, newsize, error, expected_error); + } + + if (raidz_expand_pause_point) { + if (error != 0) { + /* + * Do not verify scratch object in case of error + * returned by vdev attaching. + */ + raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; + } + + VERIFY0(thread_join(scratch_thread)); + } +out: + mutex_exit(&ztest_vdev_lock); + + umem_free(newpath, MAXPATHLEN); +} + void ztest_device_removal(ztest_ds_t *zd, uint64_t id) { @@ -4031,6 +4367,18 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) return; } + /* + * If we are under raidz expansion, the test can failed because the + * metaslabs count will not increase immediately after the vdev is + * expanded. It will happen only after raidz expansion completion. 
+ */ + if (spa->spa_raidz_expand) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; @@ -5815,7 +6163,7 @@ ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) dmu_object_info_t doi; dmu_buf_t *db; - ztest_object_lock(zd, obj, RL_READER); + ztest_object_lock(zd, obj, ZTRL_READER); if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { ztest_object_unlock(zd, obj); continue; @@ -6038,6 +6386,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeull; uint64_t top, leaf; + uint64_t raidz_children; char *path0; char *pathrand; size_t fsize; @@ -6048,6 +6397,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; + boolean_t injected = B_FALSE; path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); @@ -6060,15 +6410,23 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * strategy for damaging blocks does not take in to account evacuated * blocks which may have already been damaged. */ - if (ztest_device_removal_active) { - mutex_exit(&ztest_vdev_lock); + if (ztest_device_removal_active) + goto out; + + /* + * The fault injection strategy for damaging blocks cannot be used + * if raidz expansion is in progress. The leaves value + * (attached raidz children) is variable and strategy for damaging + * blocks will corrupt same data blocks on different child vdevs + * because of the reflow process. + */ + if (spa->spa_raidz_expand != NULL) goto out; - } maxfaults = MAXFAULTS(zs); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; mirror_save = zs->zs_mirrors; - mutex_exit(&ztest_vdev_lock); ASSERT3U(leaves, >=, 1); @@ -6209,13 +6567,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * call vdev_[on|off]line without holding locks * to force unpredictable failures but the side * effects of vdev_[on|off]line prevent us from - * doing so. We grab the ztest_vdev_lock here to - * prevent a race between injection testing and - * aux_vdev removal. + * doing so. 
*/ - mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); - mutex_exit(&ztest_vdev_lock); } } @@ -6289,9 +6643,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; - mutex_enter(&ztest_vdev_lock); if (mirror_save != zs->zs_mirrors) { - mutex_exit(&ztest_vdev_lock); (void) close(fd); goto out; } @@ -6301,15 +6653,25 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) "can't inject bad word at 0x%"PRIx64" in %s", offset, pathrand); - mutex_exit(&ztest_vdev_lock); - if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%"PRIx64"\n", pathrand, offset); + + injected = B_TRUE; } (void) close(fd); out: + mutex_exit(&ztest_vdev_lock); + + if (injected && ztest_opts.zo_raid_do_expand) { + int error = spa_scan(spa, POOL_SCAN_SCRUB); + if (error == 0) { + while (dsl_scan_scrubbing(spa_get_dsl(spa))) + txg_wait_synced(spa_get_dsl(spa), 0); + } + } + umem_free(path0, MAXPATHLEN); umem_free(pathrand, MAXPATHLEN); } @@ -7194,6 +7556,75 @@ ztest_execute(int test, ztest_info_t *zi, uint64_t id) (double)functime / NANOSEC, zi->zi_funcname); } +typedef struct ztest_raidz_expand_io { + uint64_t rzx_id; + uint64_t rzx_amount; + uint64_t rzx_bufsize; + const void *rzx_buffer; + uint64_t rzx_alloc_max; + spa_t *rzx_spa; +} ztest_expand_io_t; + +#undef OD_ARRAY_SIZE +#define OD_ARRAY_SIZE 10 + +/* + * Write a request amount of data to some dataset objects. + * There will be ztest_opts.zo_threads count of these running in parallel. + */ +static __attribute__((noreturn)) void +ztest_rzx_thread(void *arg) +{ + ztest_expand_io_t *info = (ztest_expand_io_t *)arg; + ztest_od_t *od; + int batchsize; + int od_size; + ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; + spa_t *spa = info->rzx_spa; + + od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; + od = umem_alloc(od_size, UMEM_NOFAIL); + batchsize = OD_ARRAY_SIZE; + + /* Create objects to write to */ + for (int b = 0; b < batchsize; b++) { + ztest_od_init(od + b, info->rzx_id, FTAG, b, + DMU_OT_UINT64_OTHER, 0, 0, 0); + } + if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { + umem_free(od, od_size); + thread_exit(); + } + + for (uint64_t offset = 0, written = 0; written < info->rzx_amount; + offset += info->rzx_bufsize) { + /* write to 10 objects */ + for (int i = 0; i < batchsize && written < info->rzx_amount; + i++) { + (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); + ztest_write(zd, od[i].od_object, offset, + info->rzx_bufsize, info->rzx_buffer); + (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); + written += info->rzx_bufsize; + } + txg_wait_synced(spa_get_dsl(spa), 0); + /* due to inflation, we'll typically bail here */ + if (metaslab_class_get_alloc(spa_normal_class(spa)) > + info->rzx_alloc_max) { + break; + } + } + + /* Remove a few objects to leave some holes in allocation space */ + mutex_enter(&zd->zd_dirobj_lock); + (void) ztest_remove(zd, od, 2); + mutex_exit(&zd->zd_dirobj_lock); + + umem_free(od, od_size); + + thread_exit(); +} + static __attribute__((noreturn)) void ztest_thread(void *arg) { @@ -7209,8 +7640,10 @@ ztest_thread(void *arg) /* * See if it's time to force a crash. */ - if (now > zs->zs_thread_kill) + if (now > zs->zs_thread_kill && + raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { ztest_kill(zs); + } /* * If we're getting ENOSPC with some regularity, stop. 
@@ -7400,9 +7833,14 @@ ztest_freeze(void) spa_t *spa; int numloops = 0; + /* freeze not supported during RAIDZ expansion */ + if (ztest_opts.zo_raid_do_expand) + return; + if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY0(ztest_dataset_open(0)); @@ -7470,6 +7908,7 @@ ztest_freeze(void) /* * Open and close the pool and dataset to induce log replay. */ + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); @@ -7519,6 +7958,7 @@ ztest_import(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); ztest_import_impl(); @@ -7543,7 +7983,291 @@ ztest_import(ztest_shared_t *zs) } /* - * Kick off threads to run tests on all datasets in parallel. + * After the expansion was killed, check that the pool is healthy + */ +static void +ztest_raidz_expand_check(spa_t *spa) +{ + ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); + /* + * Set pool check done flag, main program will run a zdb check + * of the pool when we exit. + */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; + + /* Wait for reflow to finish */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("\nwaiting for reflow to finish ...\n"); + } + pool_raidz_expand_stat_t rzx_stats; + pool_raidz_expand_stat_t *pres = &rzx_stats; + do { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 500); /* wait 1/2 second */ + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } while (pres->pres_state != DSS_FINISHED && + pres->pres_reflowed < pres->pres_to_reflow); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("verifying an interrupted raidz " + "expansion using a pool scrub ...\n"); + } + /* Will fail here if there is non-recoverable corruption detected */ + VERIFY0(ztest_scrub_impl(spa)); + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expansion scrub check complete\n"); + } +} + +/* + * Start a raidz expansion test. We run some I/O on the pool for a while + * to get some data in the pool. Then we grow the raidz and + * kill the test at the requested offset into the reflow, verifying that + * doing such does not lead to pool corruption. 
+ */ +static void +ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) +{ + nvlist_t *root; + pool_raidz_expand_stat_t rzx_stats; + pool_raidz_expand_stat_t *pres = &rzx_stats; + kthread_t **run_threads; + vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; + int total_disks = rzvd->vdev_children; + int data_disks = total_disks - vdev_get_nparity(rzvd); + uint64_t alloc_goal; + uint64_t csize; + int error, t; + int threads = ztest_opts.zo_threads; + ztest_expand_io_t *thread_args; + + ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); + ASSERT3U(rzvd->vdev_ops, ==, &vdev_raidz_ops); + ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; + + /* Setup a 1 MiB buffer of random data */ + uint64_t bufsize = 1024 * 1024; + void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); + + if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { + fatal(B_TRUE, "short read from /dev/urandom"); + } + /* + * Put some data in the pool and then attach a vdev to initiate + * reflow. + */ + run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); + thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), + UMEM_NOFAIL); + /* Aim for roughly 25% of allocatable space up to 1GB */ + alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; + alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); + if (ztest_opts.zo_verbose >= 1) { + (void) printf("adding data to pool '%s', goal %llu bytes\n", + ztest_opts.zo_pool, (u_longlong_t)alloc_goal); + } + + /* + * Kick off all the I/O generators that run in parallel. + */ + for (t = 0; t < threads; t++) { + if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { + umem_free(run_threads, threads * sizeof (kthread_t *)); + umem_free(buffer, bufsize); + return; + } + thread_args[t].rzx_id = t; + thread_args[t].rzx_amount = alloc_goal / threads; + thread_args[t].rzx_bufsize = bufsize; + thread_args[t].rzx_buffer = buffer; + thread_args[t].rzx_alloc_max = alloc_goal; + thread_args[t].rzx_spa = spa; + run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, + &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); + } + + /* + * Wait for all of the writers to complete. + */ + for (t = 0; t < threads; t++) + VERIFY0(thread_join(run_threads[t])); + + /* + * Close all datasets. This must be done after all the threads + * are joined so we can be sure none of the datasets are in-use + * by any of the threads. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(buffer, bufsize); + umem_free(run_threads, threads * sizeof (kthread_t *)); + umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); + + /* Set our reflow target to 25%, 50% or 75% of allocated size */ + uint_t multiple = ztest_random(3) + 1; + uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; + raidz_expand_max_reflow_bytes = reflow_max; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("running raidz expansion test, killing when " + "reflow reaches %llu bytes (%u/4 of allocated space)\n", + (u_longlong_t)reflow_max, multiple); + } + + /* XXX - do we want some I/O load during the reflow? 
*/ + + /* + * Use a disk size that is larger than existing ones + */ + cvd = rzvd->vdev_child[0]; + csize = vdev_get_min_asize(cvd); + csize += csize / 10; + /* + * Path to vdev to be attached + */ + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); + /* + * Build the nvlist describing newpath. + */ + root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), + NULL, 0, 0, 1); + /* + * Expand the raidz vdev by attaching the new disk + */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", + (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, + newpath); + } + error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); + nvlist_free(root); + if (error != 0) { + fatal(0, "raidz expand: attach (%s %llu) returned %d", + newpath, (long long)csize, error); + } + + /* + * Wait for reflow to begin + */ + while (spa->spa_raidz_expand == NULL) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + } + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + while (pres->pres_state != DSS_SCANNING) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + ASSERT3U(pres->pres_state, ==, DSS_SCANNING); + ASSERT3U(pres->pres_to_reflow, !=, 0); + /* + * Set so when we are killed we go to raidz checking rather than + * restarting test. + */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expansion reflow started, waiting for " + "%llu bytes to be copied\n", (u_longlong_t)reflow_max); + } + + /* + * Wait for reflow maximum to be reached and then kill the test + */ + while (pres->pres_reflowed < reflow_max) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + /* Reset the reflow pause before killing */ + raidz_expand_max_reflow_bytes = 0; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("killing raidz expansion test after reflow " + "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); + } + + /* + * Kill ourself to simulate a panic during a reflow. Our parent will + * restart the test and the changed flag value will drive the test + * through the scrub/check code to verify the pool is not corrupted. + */ + ztest_kill(zs); +} + +static void +ztest_generic_run(ztest_shared_t *zs, spa_t *spa) +{ + kthread_t **run_threads; + int t; + + run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), + UMEM_NOFAIL); + + /* + * Kick off all the tests that run in parallel. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { + umem_free(run_threads, ztest_opts.zo_threads * + sizeof (kthread_t *)); + return; + } + + run_threads[t] = thread_create(NULL, 0, ztest_thread, + (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); + } + + /* + * Wait for all of the tests to complete. 
+ */ + for (t = 0; t < ztest_opts.zo_threads; t++) + VERIFY0(thread_join(run_threads[t])); + + /* + * Close all datasets. This must be done after all the threads + * are joined so we can be sure none of the datasets are in-use + * by any of the threads. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); +} + +/* + * Setup our test context and kick off threads to run tests on all datasets + * in parallel. */ static void ztest_run(ztest_shared_t *zs) @@ -7551,7 +8275,6 @@ ztest_run(ztest_shared_t *zs) spa_t *spa; objset_t *os; kthread_t *resume_thread, *deadman_thread; - kthread_t **run_threads; uint64_t object; int error; int t, d; @@ -7584,6 +8307,7 @@ ztest_run(ztest_shared_t *zs) * Open our pool. It may need to be imported first depending on * what tests were running when the previous pass was terminated. */ + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); error = spa_open(ztest_opts.zo_pool, &spa, FTAG); if (error) { @@ -7597,7 +8321,11 @@ ztest_run(ztest_shared_t *zs) metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; - VERIFY0(vdev_raidz_impl_set("cycle")); + /* + * XXX - BUGBUG raidz expansion do not run this for generic for now + */ + if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) + VERIFY0(vdev_raidz_impl_set("cycle")); dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, @@ -7607,6 +8335,10 @@ ztest_run(ztest_shared_t *zs) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); dmu_objset_disown(os, B_TRUE, FTAG); + /* Give the dedicated raidz expansion test more grace time */ + if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) + zfs_deadman_synctime_ms *= 2; + /* * Create a thread to periodically resume suspended I/O. */ @@ -7640,6 +8372,10 @@ ztest_run(ztest_shared_t *zs) * If we got any ENOSPC errors on the previous run, destroy something. */ if (zs->zs_enospc_count != 0) { + /* Not expecting ENOSPC errors during raidz expansion tests */ + ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, + RAIDZ_EXPAND_NONE); + int d = ztest_random(ztest_opts.zo_datasets); ztest_dataset_destroy(d); } @@ -7654,9 +8390,12 @@ ztest_run(ztest_shared_t *zs) * that we always run the scrub whenever an indirect vdev exists * because we have no way of knowing for sure if ztest_device_removal() * fully completed its scrub before the pool was reimported. + * + * Does not apply for the RAIDZ expansion specific test runs */ - if (spa->spa_removing_phys.sr_state == DSS_SCANNING || - spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && + (spa->spa_removing_phys.sr_state == DSS_SCANNING || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); @@ -7666,9 +8405,6 @@ ztest_run(ztest_shared_t *zs) ASSERT0(error); } - run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), - UMEM_NOFAIL); - if (ztest_opts.zo_verbose >= 4) (void) printf("starting main threads...\n"); @@ -7681,43 +8417,12 @@ ztest_run(ztest_shared_t *zs) (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, NULL, DS_FIND_CHILDREN); - /* - * Kick off all the tests that run in parallel. 
- */ - for (t = 0; t < ztest_opts.zo_threads; t++) { - if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { - umem_free(run_threads, ztest_opts.zo_threads * - sizeof (kthread_t *)); - return; - } - - run_threads[t] = thread_create(NULL, 0, ztest_thread, - (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, - defclsyspri); - } - - /* - * Wait for all of the tests to complete. - */ - for (t = 0; t < ztest_opts.zo_threads; t++) - VERIFY0(thread_join(run_threads[t])); - - /* - * Close all datasets. This must be done after all the threads - * are joined so we can be sure none of the datasets are in-use - * by any of the threads. - */ - for (t = 0; t < ztest_opts.zo_threads; t++) { - if (t < ztest_opts.zo_datasets) - ztest_dataset_close(t); - } - - txg_wait_synced(spa_get_dsl(spa), 0); - - zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); - zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); - - umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); + if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) + ztest_raidz_expand_run(zs, spa); + else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) + ztest_raidz_expand_check(spa); + else + ztest_generic_run(zs, spa); /* Kill the resume and deadman threads */ ztest_exiting = B_TRUE; @@ -7826,6 +8531,7 @@ ztest_init(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* @@ -7911,6 +8617,7 @@ shared_data_size(ztest_shared_hdr_t *hdr) size += hdr->zh_size; size += hdr->zh_stats_size * hdr->zh_stats_count; size += hdr->zh_ds_size * hdr->zh_ds_count; + size += hdr->zh_scratch_state_size; return (size); } @@ -7934,6 +8641,7 @@ setup_hdr(void) hdr->zh_stats_count = ZTEST_FUNCS; hdr->zh_ds_size = sizeof (ztest_shared_ds_t); hdr->zh_ds_count = ztest_opts.zo_datasets; + hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); size = shared_data_size(hdr); VERIFY0(ftruncate(ztest_fd_data, size)); @@ -7968,6 +8676,8 @@ setup_data(void) ztest_shared_callstate = (void *)&buf[offset]; offset += hdr->zh_stats_size * hdr->zh_stats_count; ztest_shared_ds = (void *)&buf[offset]; + offset += hdr->zh_ds_size * hdr->zh_ds_count; + ztest_scratch_state = (void *)&buf[offset]; } static boolean_t @@ -8188,13 +8898,14 @@ main(int argc, char **argv) hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); if (ztest_opts.zo_verbose >= 1) { - (void) printf("%"PRIu64" vdevs, %d datasets, %d threads," - "%d %s disks, %"PRIu64" seconds...\n\n", + (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " + "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, ztest_opts.zo_raid_children, ztest_opts.zo_raid_type, + ztest_opts.zo_raid_parity, ztest_opts.zo_time); } @@ -8307,6 +9018,9 @@ main(int argc, char **argv) if (!ztest_opts.zo_mmp_test) ztest_run_zdb(zs->zs_guid); + if (ztest_shared_opts->zo_raidz_expand_test == + RAIDZ_EXPAND_CHECKED) + break; /* raidz expand test complete */ } if (ztest_opts.zo_verbose >= 1) { |