author     Martin Matuska <mm@FreeBSD.org>          2023-11-09 10:42:33 +0000
committer  Martin Matuska <mm@FreeBSD.org>          2023-11-09 12:19:17 +0000
commit     e716630d4cf89e69ec3f675ebfceee09f1a85e05 (patch)
tree       3ee825a5671f470e1481d24312b58895a12d01ac /sys/contrib/openzfs/cmd
parent     f5b3e686292b6502878c64c3c154908024e06eb6 (diff)
parent     887a3c533b94a4b70075e310f15c45b9dee19410 (diff)
zfs: merge openzfs/zfs@887a3c533
Notable upstream pull request merges:
#15022 5caeef02f RAID-Z expansion feature
#15457 887a3c533 Increase L2ARC write rate and headroom
#15504 1c1be60fa Unbreak FreeBSD world build after 3bd4df384
Obtained from: OpenZFS
OpenZFS commit: 887a3c533b94a4b70075e310f15c45b9dee19410
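Note on the RAID-Z expansion merge (#15022): the zpool_main.c hunk below changes "zpool attach -w" to wait on the new ZPOOL_WAIT_RAIDZ_EXPAND activity when the attach target is a raidz vdev, instead of waiting for a resilver. For orientation only, a minimal libzfs sketch of waiting on that activity is shown here; the helper name and error handling are illustrative assumptions, not part of this commit.

/*
 * Minimal sketch (assumption, not from this commit): wait for an
 * in-progress raidz expansion on a pool via libzfs, mirroring what
 * "zpool attach -w" now does when the attach target is a raidz vdev.
 */
#include <libzfs.h>

static int
wait_for_raidz_expand(const char *poolname)
{
	libzfs_handle_t *hdl = libzfs_init();
	if (hdl == NULL)
		return (1);

	zpool_handle_t *zhp = zpool_open(hdl, poolname);
	if (zhp == NULL) {
		libzfs_fini(hdl);
		return (1);
	}

	/* ZPOOL_WAIT_RAIDZ_EXPAND is the wait activity added by this merge. */
	int err = zpool_wait(zhp, ZPOOL_WAIT_RAIDZ_EXPAND);

	zpool_close(zhp);
	libzfs_fini(hdl);
	return (err);
}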
Diffstat (limited to 'sys/contrib/openzfs/cmd')
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c  |  12
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_test.c   | 196
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_test.h   |   3
-rw-r--r--  sys/contrib/openzfs/cmd/zdb/zdb.c                 |   5
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_main.c        | 148
-rw-r--r--  sys/contrib/openzfs/cmd/ztest.c                   | 912
6 files changed, 963 insertions, 313 deletions
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c index 8be08558b36d..730e6e1a040b 100644 --- a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c @@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, rto_opts.rto_ashift, ncols+1, ncols, - fn+1, rto_opts.rto_expand_offset); + fn+1, rto_opts.rto_expand_offset, + 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, fn+1); @@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, BENCH_ASHIFT, ncols+1, ncols, - PARITY_PQR, rto_opts.rto_expand_offset); + PARITY_PQR, + rto_opts.rto_expand_offset, 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, PARITY_PQR); diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c index 195026d3a7ab..6a018ecf0737 100644 --- a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c @@ -327,14 +327,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) if (opts->rto_expand) { opts->rm_golden = - vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, - opts->zio_golden->io_size, opts->zio_golden->io_offset, + vdev_raidz_map_alloc_expanded(opts->zio_golden, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); - rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, - zio_test->io_size, zio_test->io_offset, + parity, opts->rto_expand_offset, 0, B_FALSE); + rm_test = vdev_raidz_map_alloc_expanded(zio_test, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, opts->rto_ashift, total_ncols, parity); @@ -361,187 +359,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) return (err); } -/* - * If reflow is not in progress, reflow_offset should be UINT64_MAX. - * For each row, if the row is entirely before reflow_offset, it will - * come from the new location. Otherwise this row will come from the - * old location. Therefore, rows that straddle the reflow_offset will - * come from the old location. - * - * NOTE: Until raidz expansion is implemented this function is only - * needed by raidz_test.c to the multi-row raid_map_t functionality. - */ -raidz_map_t * -vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, - uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, - uint64_t nparity, uint64_t reflow_offset) -{ - /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = size >> ashift; - uint64_t q, r, bc, devidx, asize = 0, tot; - - /* - * "Quotient": The number of data sectors for this stripe on all but - * the "big column" child vdevs that also contain "remainder" data. - * AKA "full rows" - */ - q = s / (logical_cols - nparity); - - /* - * "Remainder": The number of partial stripe data sectors in this I/O. - * This will add a sector to some, but not all, child vdevs. - */ - r = s - q * (logical_cols - nparity); - - /* The number of "big columns" - those which contain remainder data. 
*/ - bc = (r == 0 ? 0 : r + nparity); - - /* - * The total number of data and parity sectors associated with - * this I/O. - */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); - - /* How many rows contain data (not skip) */ - uint64_t rows = howmany(tot, logical_cols); - int cols = MIN(tot, logical_cols); - - raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), - KM_SLEEP); - rm->rm_nrows = rows; - - for (uint64_t row = 0; row < rows; row++) { - raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, - rr_col[cols]), KM_SLEEP); - rm->rm_row[row] = rr; - - /* The starting RAIDZ (parent) vdev sector of the row. */ - uint64_t b = (offset >> ashift) + row * logical_cols; - - /* - * If we are in the middle of a reflow, and any part of this - * row has not been copied, then use the old location of - * this row. - */ - int row_phys_cols = physical_cols; - if (b + (logical_cols - nparity) > reflow_offset >> ashift) - row_phys_cols--; - - /* starting child of this row */ - uint64_t child_id = b % row_phys_cols; - /* The starting byte offset on each child vdev. */ - uint64_t child_offset = (b / row_phys_cols) << ashift; - - /* - * We set cols to the entire width of the block, even - * if this row is shorter. This is needed because parity - * generation (for Q and R) needs to know the entire width, - * because it treats the short row as though it was - * full-width (and the "phantom" sectors were zero-filled). - * - * Another approach to this would be to set cols shorter - * (to just the number of columns that we might do i/o to) - * and have another mechanism to tell the parity generation - * about the "entire width". Reconstruction (at least - * vdev_raidz_reconstruct_general()) would also need to - * know about the "entire width". - */ - rr->rr_cols = cols; - rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; - rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; - - for (int c = 0; c < rr->rr_cols; c++, child_id++) { - if (child_id >= row_phys_cols) { - child_id -= row_phys_cols; - child_offset += 1ULL << ashift; - } - rr->rr_col[c].rc_devidx = child_id; - rr->rr_col[c].rc_offset = child_offset; - rr->rr_col[c].rc_orig_data = NULL; - rr->rr_col[c].rc_error = 0; - rr->rr_col[c].rc_tried = 0; - rr->rr_col[c].rc_skipped = 0; - rr->rr_col[c].rc_need_orig_restore = B_FALSE; - - uint64_t dc = c - rr->rr_firstdatacol; - if (c < rr->rr_firstdatacol) { - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = - abd_alloc_linear(rr->rr_col[c].rc_size, - B_TRUE); - } else if (row == rows - 1 && bc != 0 && c >= bc) { - /* - * Past the end, this for parity generation. - */ - rr->rr_col[c].rc_size = 0; - rr->rr_col[c].rc_abd = NULL; - } else { - /* - * "data column" (col excluding parity) - * Add an ASCII art diagram here - */ - uint64_t off; - - if (c < bc || r == 0) { - off = dc * rows + row; - } else { - off = r * rows + - (dc - r) * (rows - 1) + row; - } - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = abd_get_offset_struct( - &rr->rr_col[c].rc_abdstruct, - abd, off << ashift, 1 << ashift); - } - - asize += rr->rr_col[c].rc_size; - } - /* - * If all data stored spans all columns, there's a danger that - * parity will always be on the same device and, since parity - * isn't read during normal operation, that that device's I/O - * bandwidth won't be used effectively. We therefore switch - * the parity every 1MB. - * - * ...at least that was, ostensibly, the theory. 
As a practical - * matter unless we juggle the parity between all devices - * evenly, we won't see any benefit. Further, occasional writes - * that aren't a multiple of the LCM of the number of children - * and the minimum stripe width are sufficient to avoid pessimal - * behavior. Unfortunately, this decision created an implicit - * on-disk format requirement that we need to support for all - * eternity, but only for single-parity RAID-Z. - * - * If we intend to skip a sector in the zeroth column for - * padding we must make sure to note this swap. We will never - * intend to skip the first column since at least one data and - * one parity column must appear in each row. - */ - if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && - (offset & (1ULL << 20))) { - ASSERT(rr->rr_cols >= 2); - ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - devidx = rr->rr_col[0].rc_devidx; - uint64_t o = rr->rr_col[0].rc_offset; - rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; - rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; - rr->rr_col[1].rc_devidx = devidx; - rr->rr_col[1].rc_offset = o; - } - - } - ASSERT3U(asize, ==, tot << ashift); - - /* init RAIDZ parity ops */ - rm->rm_ops = vdev_raidz_math_get_ops(); - - return (rm); -} - static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { @@ -561,10 +378,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) init_zio_abd(*zio); if (opts->rto_expand) { - rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, - (*zio)->io_size, (*zio)->io_offset, + rm = vdev_raidz_map_alloc_expanded(*zio, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h index 163929defc73..f912e281f6f3 100644 --- a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h @@ -119,7 +119,4 @@ void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); -struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); - #endif /* RAIDZ_TEST_H */ diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 3c282f3fc975..18221c4b92d2 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -4134,6 +4134,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); + + (void) printf("\traidz_reflow state=%u off=%llu\n", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + (void) printf("%s", footer ? footer : ""); } diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index 5507f9d3fd67..9dd1d2109004 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -6650,9 +6650,17 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); - if (ret == 0 && wait) - ret = zpool_wait(zhp, - replacing ? 
ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); + if (ret == 0 && wait) { + zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER; + char raidz_prefix[] = "raidz"; + if (replacing) { + activity = ZPOOL_WAIT_REPLACE; + } else if (strncmp(old_disk, + raidz_prefix, strlen(raidz_prefix)) == 0) { + activity = ZPOOL_WAIT_RAIDZ_EXPAND; + } + ret = zpool_wait(zhp, activity); + } nvlist_free(props); nvlist_free(nvroot); @@ -6678,17 +6686,21 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device> + * zpool attach [-fsw] [-o property=value] <pool> <device>|<vdev> <new_device> * * -f Force attach, even if <new_device> appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. - * -w Wait for resilvering to complete before returning + * -w Wait for resilvering (mirror) or expansion (raidz) to complete + * before returning. * - * Attach <new_device> to the mirror containing <device>. If <device> is not - * part of a mirror, then <device> will be transformed into a mirror of - * <device> and <new_device>. In either case, <new_device> will begin life - * with a DTL of [0, now], and will immediately begin to resilver itself. + * Attach <new_device> to a <device> or <vdev>, where the vdev can be of type + * mirror or raidz. If <device> is not part of a mirror, then <device> will + * be transformed into a mirror of <device> and <new_device>. When a mirror + * is involved, <new_device> will begin life with a DTL of [0, now], and will + * immediately begin to resilver itself. For the raidz case, a expansion will + * commence and reflow the raidz data across all the disks including the + * <new_device>. */ int zpool_do_attach(int argc, char **argv) @@ -8195,6 +8207,97 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) } } +/* + * Print out detailed raidz expansion status. + */ +static void +print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres) +{ + char copied_buf[7]; + + if (pres == NULL || pres->pres_state == DSS_NONE) + return; + + /* + * Determine name of vdev. + */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + nvlist_t **child; + uint_t children; + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + assert(pres->pres_expanding_vdev < children); + + printf_color(ANSI_BOLD, gettext("expand: ")); + + time_t start = pres->pres_start_time; + time_t end = pres->pres_end_time; + char *vname = + zpool_vdev_name(g_zfs, zhp, child[pres->pres_expanding_vdev], 0); + zfs_nicenum(pres->pres_reflowed, copied_buf, sizeof (copied_buf)); + + /* + * Expansion is finished or canceled. + */ + if (pres->pres_state == DSS_FINISHED) { + char time_buf[32]; + secs_to_dhms(end - start, time_buf); + + (void) printf(gettext("expanded %s-%u copied %s in %s, " + "on %s"), vname, (int)pres->pres_expanding_vdev, + copied_buf, time_buf, ctime((time_t *)&end)); + } else { + char examined_buf[7], total_buf[7], rate_buf[7]; + uint64_t copied, total, elapsed, secs_left; + double fraction_done; + uint_t rate; + + assert(pres->pres_state == DSS_SCANNING); + + /* + * Expansion is in progress. + */ + (void) printf(gettext( + "expansion of %s-%u in progress since %s"), + vname, (int)pres->pres_expanding_vdev, ctime(&start)); + + copied = pres->pres_reflowed > 0 ? 
pres->pres_reflowed : 1; + total = pres->pres_to_reflow; + fraction_done = (double)copied / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - pres->pres_start_time; + elapsed = elapsed > 0 ? elapsed : 1; + rate = copied / elapsed; + rate = rate > 0 ? rate : 1; + secs_left = (total - copied) / rate; + + zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than + * 30 days + */ + (void) printf(gettext("\t%s / %s copied at %s/s, %.2f%% done"), + examined_buf, total_buf, rate_buf, 100 * fraction_done); + if (pres->pres_waiting_for_resilver) { + (void) printf(gettext(", paused for resilver or " + "clear\n")); + } else if (secs_left < (30 * 24 * 3600)) { + char time_buf[32]; + secs_to_dhms(secs_left, time_buf); + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext( + ", (copy is slow, no estimated time)\n")); + } + } + free(vname); +} static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { @@ -8772,19 +8875,24 @@ status_callback(zpool_handle_t *zhp, void *data) uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - pool_checkpoint_stat_t *pcs = NULL; - pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); + pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); + pool_checkpoint_stat_t *pcs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); + pool_raidz_expand_stat_t *pres = NULL; + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + print_raidz_expand_status(zhp, pres); + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) @@ -10738,8 +10846,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; + pool_raidz_expand_stat_t *pres = NULL; const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE", - "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"}; + "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ @@ -10798,6 +10907,13 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) vdev_activity_top_remaining(nvroot); } + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + if (pres != NULL && pres->pres_state == DSS_SCANNING) { + int64_t rem = pres->pres_to_reflow - pres->pres_reflowed; + bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem; + } + bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = @@ -10827,11 +10943,12 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) if (!wd->wd_enabled[i]) continue; - if (wd->wd_exact) + if (wd->wd_exact) { (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); - else + } else { zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); + } if (wd->wd_scripted) (void) printf(i == 0 ? 
"%s" : "\t%s", buf); @@ -10937,7 +11054,8 @@ zpool_do_wait(int argc, char **argv) for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "discard", "free", "initialize", "replace", - "remove", "resilver", "scrub", "trim" }; + "remove", "resilver", "scrub", "trim", + "raidz_expand" }; for (i = 0; i < ARRAY_SIZE(col_opts); ++i) if (strcmp(tok, col_opts[i]) == 0) { diff --git a/sys/contrib/openzfs/cmd/ztest.c b/sys/contrib/openzfs/cmd/ztest.c index 8cfbdfe1c2e2..1d414a9f6fd5 100644 --- a/sys/contrib/openzfs/cmd/ztest.c +++ b/sys/contrib/openzfs/cmd/ztest.c @@ -151,6 +151,7 @@ typedef struct ztest_shared_hdr { uint64_t zh_stats_count; uint64_t zh_ds_size; uint64_t zh_ds_count; + uint64_t zh_scratch_state_size; } ztest_shared_hdr_t; static ztest_shared_hdr_t *ztest_shared_hdr; @@ -161,6 +162,16 @@ enum ztest_class_state { ZTEST_VDEV_CLASS_RND }; +/* Dedicated RAIDZ Expansion test states */ +typedef enum { + RAIDZ_EXPAND_NONE, /* Default is none, must opt-in */ + RAIDZ_EXPAND_REQUESTED, /* The '-X' option was used */ + RAIDZ_EXPAND_STARTED, /* Testing has commenced */ + RAIDZ_EXPAND_KILLED, /* Reached the proccess kill */ + RAIDZ_EXPAND_CHECKED, /* Pool scrub verification done */ +} raidz_expand_test_state_t; + + #define ZO_GVARS_MAX_ARGLEN ((size_t)64) #define ZO_GVARS_MAX_COUNT ((size_t)10) @@ -174,6 +185,7 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; + int zo_raid_do_expand; int zo_raid_children; int zo_raid_parity; char zo_raid_type[8]; @@ -188,6 +200,7 @@ typedef struct ztest_shared_opts { uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; + raidz_expand_test_state_t zo_raidz_expand_test; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; @@ -249,6 +262,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, .zo_gvars_count = 0, + .zo_raidz_expand_test = RAIDZ_EXPAND_NONE, }; extern uint64_t metaslab_force_ganging; @@ -261,6 +275,8 @@ extern uint_t dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; +extern uint64_t raidz_expand_max_reflow_bytes; +extern uint_t raidz_expand_pause_point; static ztest_shared_opts_t *ztest_shared_opts; @@ -274,6 +290,12 @@ typedef struct ztest_shared_ds { static ztest_shared_ds_t *ztest_shared_ds; #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) +typedef struct ztest_scratch_state { + uint64_t zs_raidz_scratch_verify_pause; +} ztest_shared_scratch_state_t; + +static ztest_shared_scratch_state_t *ztest_scratch_state; + #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) @@ -311,9 +333,9 @@ typedef struct bufwad { * still need to map from object ID to rangelock_t. 
*/ typedef enum { - RL_READER, - RL_WRITER, - RL_APPEND + ZTRL_READER, + ZTRL_WRITER, + ZTRL_APPEND } rl_type_t; typedef struct rll { @@ -408,6 +430,7 @@ ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; +ztest_func_t ztest_vdev_raidz_attach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_class_add; @@ -465,6 +488,7 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), + ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), @@ -745,7 +769,7 @@ static ztest_option_t option_table[] = { DEFAULT_RAID_CHILDREN, NULL}, { 'R', "raid-parity", "INTEGER", "Raid parity", DEFAULT_RAID_PARITY, NULL}, - { 'K', "raid-kind", "raidz|draid|random", "Raid kind", + { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", NO_DEFAULT, "random"}, { 'D', "draid-data", "INTEGER", "Number of draid data drives", DEFAULT_DRAID_DATA, NULL}, @@ -781,6 +805,9 @@ static ztest_option_t option_table[] = { NO_DEFAULT, NULL}, { 'C', "vdev-class-state", "on|off|random", "vdev class state", NO_DEFAULT, "random"}, + { 'X', "raidz-expansion", NULL, + "Perform a dedicated raidz expansion test", + NO_DEFAULT, NULL}, { 'o', "option", "\"OPTION=INTEGER\"", "Set global variable to an unsigned 32-bit integer value", NO_DEFAULT, NULL}, @@ -853,7 +880,7 @@ usage(boolean_t requested) option_table[i].short_opt, option_table[i].long_opt); } - (void) fprintf(fp, " %-40s%s", option, + (void) fprintf(fp, " %-43s%s", option, option_table[i].comment); if (option_table[i].long_opt_param != NULL) { @@ -1027,6 +1054,9 @@ process_options(int argc, char **argv) case 'V': zo->zo_verbose++; break; + case 'X': + zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED; + break; case 'E': zo->zo_init = 0; break; @@ -1078,9 +1108,28 @@ process_options(int argc, char **argv) fini_options(); - /* When raid choice is 'random' add a draid pool 50% of the time */ + /* Force compatible options for raidz expansion run */ + if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) { + zo->zo_mmp_test = 0; + zo->zo_mirrors = 0; + zo->zo_vdevs = 1; + zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2; + zo->zo_raid_do_expand = B_FALSE; + raid_kind = "raidz"; + } + if (strcmp(raid_kind, "random") == 0) { - raid_kind = (ztest_random(2) == 0) ? 
"draid" : "raidz"; + switch (ztest_random(3)) { + case 0: + raid_kind = "raidz"; + break; + case 1: + raid_kind = "eraidz"; + break; + case 2: + raid_kind = "draid"; + break; + } if (ztest_opts.zo_verbose >= 3) (void) printf("choosing RAID type '%s'\n", raid_kind); @@ -1119,6 +1168,18 @@ process_options(int argc, char **argv) (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, sizeof (zo->zo_raid_type)); + } else if (strcmp(raid_kind, "eraidz") == 0) { + /* using eraidz (expandable raidz) */ + zo->zo_raid_do_expand = B_TRUE; + + /* tests expect top-level to be raidz */ + zo->zo_mirrors = 0; + zo->zo_vdevs = 1; + + /* Make sure parity is less than data columns */ + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } else /* using raidz */ { ASSERT0(strcmp(raid_kind, "raidz")); @@ -1166,9 +1227,29 @@ ztest_kill(ztest_shared_t *zs) * Before we kill ourselves, make sure that the config is updated. * See comment above spa_write_cachefile(). */ - mutex_enter(&spa_namespace_lock); - spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); - mutex_exit(&spa_namespace_lock); + if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { + if (mutex_tryenter(&spa_namespace_lock)) { + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, + B_FALSE); + mutex_exit(&spa_namespace_lock); + + ztest_scratch_state->zs_raidz_scratch_verify_pause = + raidz_expand_pause_point; + } else { + /* + * Do not verify scratch object in case if + * spa_namespace_lock cannot be acquired, + * it can cause deadlock in spa_config_update(). + */ + raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; + + return; + } + } else { + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); + mutex_exit(&spa_namespace_lock); + } (void) raise(SIGKILL); } @@ -1615,7 +1696,7 @@ ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); - if (type == RL_READER) { + if (type == ZTRL_READER) { while (rll->rll_writer != NULL) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; @@ -2071,7 +2152,7 @@ ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT3U(object, !=, 0); - ztest_object_lock(zd, object, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_WRITER); VERIFY0(dmu_object_info(os, object, &doi)); @@ -2141,8 +2222,8 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) if (bt->bt_magic != BT_MAGIC) bt = NULL; - ztest_object_lock(zd, lr->lr_foid, RL_READER); - rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2245,9 +2326,9 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_READER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, - RL_WRITER); + ZTRL_WRITER); tx = dmu_tx_create(os); @@ -2287,7 +2368,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2414,7 +2495,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, ASSERT3P(lwb, !=, NULL); 
ASSERT3U(size, !=, 0); - ztest_object_lock(zd, object, RL_READER); + ztest_object_lock(zd, object, ZTRL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); @@ -2439,7 +2520,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, if (buf != NULL) { /* immediate write */ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, ZTRL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -2455,7 +2536,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, } zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, ZTRL_READER); error = dmu_buf_hold_noread(os, object, offset, zgd, &db); @@ -2531,7 +2612,7 @@ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) ASSERT3U(od->od_object, !=, 0); ASSERT0(missing); /* there should be no gaps */ - ztest_object_lock(zd, od->od_object, RL_READER); + ztest_object_lock(zd, od->od_object, ZTRL_READER); VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); @@ -2634,7 +2715,7 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) static int ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, - void *data) + const void *data) { lr_write_t *lr; int error; @@ -2704,8 +2785,8 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) txg_wait_synced(dmu_objset_pool(os), 0); - ztest_object_lock(zd, object, RL_READER); - rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_READER); + rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); tx = dmu_tx_create(os); @@ -3033,13 +3114,32 @@ ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) spa_config_exit(spa, SCL_CONFIG, FTAG); } +static int +ztest_get_raidz_children(spa_t *spa) +{ + (void) spa; + vdev_t *raidvd; + + ASSERT(MUTEX_HELD(&ztest_vdev_lock)); + + if (ztest_opts.zo_raid_do_expand) { + raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; + + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + return (raidvd->vdev_children); + } + + return (ztest_opts.zo_raid_children); +} + void ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; - uint64_t version, newversion; + uint64_t raidz_children, version, newversion; nvlist_t *nvroot, *props; char *name; @@ -3058,8 +3158,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) */ (void) spa_destroy(name); + raidz_children = ztest_get_raidz_children(ztest_spa); + nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); + NULL, raidz_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the @@ -3125,6 +3227,7 @@ ztest_spa_checkpoint(spa_t *spa) case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: + case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: break; case ENOSPC: ztest_record_enospc(FTAG); @@ -3205,6 +3308,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; + uint64_t raidz_children; + nvlist_t *nvroot; int error; @@ -3212,8 +3317,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) return; mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - 
ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -3267,7 +3372,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? - "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, + "log" : NULL, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); @@ -3295,6 +3400,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; nvlist_t *nvroot; + uint64_t raidz_children; const char *class = (ztest_random(2) == 0) ? VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; @@ -3322,15 +3428,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) return; } - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); + class, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); @@ -3592,6 +3698,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; + uint64_t raidz_children; char *oldpath, *newpath; int replacing; int oldvd_has_siblings = B_FALSE; @@ -3608,7 +3715,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3624,6 +3732,15 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) } /* + * RAIDZ leaf VDEV mirrors are not currently supported while a + * RAIDZ expansion is in progress. + */ + if (ztest_opts.zo_raid_do_expand) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + /* * Decide whether to do an attach or a replace. 
*/ replacing = ztest_random(2); @@ -3647,7 +3764,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (zs->zs_mirrors >= 1) { ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf / raidz_children]; } /* pick a child out of the raidz group */ @@ -3656,8 +3773,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); - ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf % raidz_children]; } /* @@ -3825,6 +3941,226 @@ out: umem_free(newpath, MAXPATHLEN); } +static void +raidz_scratch_verify(void) +{ + spa_t *spa; + uint64_t write_size, logical_size, offset; + raidz_reflow_scratch_state_t state; + vdev_raidz_expand_t *vre; + vdev_t *raidvd; + + ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); + + if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) + return; + + kernel_init(SPA_MODE_READ); + + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(ztest_opts.zo_pool); + ASSERT(spa); + spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; + mutex_exit(&spa_namespace_lock); + + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + + ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + + vre = spa->spa_raidz_expand; + if (vre == NULL) + goto out; + + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + state = RRSS_GET_STATE(&spa->spa_uberblock); + write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift); + logical_size = write_size * raidvd->vdev_children; + + switch (state) { + /* + * Initial state of reflow process. RAIDZ expansion was + * requested by user, but scratch object was not created. + */ + case RRSS_SCRATCH_NOT_IN_USE: + ASSERT3U(offset, ==, 0); + break; + + /* + * Scratch object was synced and stored in boot area. + */ + case RRSS_SCRATCH_VALID: + + /* + * Scratch object was synced back to raidz start offset, + * raidz is ready for sector by sector reflow process. + */ + case RRSS_SCRATCH_INVALID_SYNCED: + + /* + * Scratch object was synced back to raidz start offset + * on zpool importing, raidz is ready for sector by sector + * reflow process. + */ + case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: + ASSERT3U(offset, ==, logical_size); + break; + + /* + * Sector by sector reflow process started. + */ + case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: + ASSERT3U(offset, >=, logical_size); + break; + } + +out: + spa_config_exit(spa, SCL_ALL, FTAG); + + mutex_exit(&ztest_vdev_lock); + + ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; + + spa_close(spa, FTAG); + kernel_fini(); +} + +static void +ztest_scratch_thread(void *arg) +{ + (void) arg; + + /* wait up to 10 seconds */ + for (int t = 100; t > 0; t -= 1) { + if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) + thread_exit(); + + (void) poll(NULL, 0, 100); + } + + /* killed when the scratch area progress reached a certain point */ + ztest_kill(ztest_shared); +} + +/* + * Verify that we can attach raidz device. 
+ */ +void +ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); + kthread_t *scratch_thread = NULL; + vdev_t *newvd, *pvd; + nvlist_t *root; + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + int error, expected_error = 0; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + + /* Only allow attach when raid-kind = 'eraidz' */ + if (!ztest_opts.zo_raid_do_expand) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (ztest_opts.zo_mmp_test) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (ztest_device_removal_active) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + pvd = vdev_lookup_top(spa, 0); + + ASSERT(pvd->vdev_ops == &vdev_raidz_ops); + + /* + * Get size of a child of the raidz group, + * make sure device is a bit bigger + */ + newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; + newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); + + /* + * Get next attached leaf id + */ + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; + zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; + + if (spa->spa_raidz_expand) + expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; + + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Path to vdev to be attached + */ + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); + + /* + * Build the nvlist describing newpath. + */ + root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, + 0, 0, 1); + + /* + * 50% of the time, set raidz_expand_pause_point to cause + * raidz_reflow_scratch_sync() to pause at a certain point and + * then kill the test after 10 seconds so raidz_scratch_verify() + * can confirm consistency when the pool is imported. + */ + if (ztest_random(2) == 0 && expected_error == 0) { + raidz_expand_pause_point = + ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; + scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, + ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + } + + error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); + + nvlist_free(root); + + if (error == EOVERFLOW || error == ENXIO || + error == ZFS_ERR_CHECKPOINT_EXISTS || + error == ZFS_ERR_DISCARDING_CHECKPOINT) + expected_error = error; + + if (error != 0 && error != expected_error) { + fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", + newpath, newsize, error, expected_error); + } + + if (raidz_expand_pause_point) { + if (error != 0) { + /* + * Do not verify scratch object in case of error + * returned by vdev attaching. + */ + raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; + } + + VERIFY0(thread_join(scratch_thread)); + } +out: + mutex_exit(&ztest_vdev_lock); + + umem_free(newpath, MAXPATHLEN); +} + void ztest_device_removal(ztest_ds_t *zd, uint64_t id) { @@ -4031,6 +4367,18 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) return; } + /* + * If we are under raidz expansion, the test can failed because the + * metaslabs count will not increase immediately after the vdev is + * expanded. It will happen only after raidz expansion completion. 
+ */ + if (spa->spa_raidz_expand) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; @@ -5815,7 +6163,7 @@ ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) dmu_object_info_t doi; dmu_buf_t *db; - ztest_object_lock(zd, obj, RL_READER); + ztest_object_lock(zd, obj, ZTRL_READER); if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { ztest_object_unlock(zd, obj); continue; @@ -6038,6 +6386,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeull; uint64_t top, leaf; + uint64_t raidz_children; char *path0; char *pathrand; size_t fsize; @@ -6048,6 +6397,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; + boolean_t injected = B_FALSE; path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); @@ -6060,15 +6410,23 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * strategy for damaging blocks does not take in to account evacuated * blocks which may have already been damaged. */ - if (ztest_device_removal_active) { - mutex_exit(&ztest_vdev_lock); + if (ztest_device_removal_active) + goto out; + + /* + * The fault injection strategy for damaging blocks cannot be used + * if raidz expansion is in progress. The leaves value + * (attached raidz children) is variable and strategy for damaging + * blocks will corrupt same data blocks on different child vdevs + * because of the reflow process. + */ + if (spa->spa_raidz_expand != NULL) goto out; - } maxfaults = MAXFAULTS(zs); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; mirror_save = zs->zs_mirrors; - mutex_exit(&ztest_vdev_lock); ASSERT3U(leaves, >=, 1); @@ -6209,13 +6567,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * call vdev_[on|off]line without holding locks * to force unpredictable failures but the side * effects of vdev_[on|off]line prevent us from - * doing so. We grab the ztest_vdev_lock here to - * prevent a race between injection testing and - * aux_vdev removal. + * doing so. 
*/ - mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); - mutex_exit(&ztest_vdev_lock); } } @@ -6289,9 +6643,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; - mutex_enter(&ztest_vdev_lock); if (mirror_save != zs->zs_mirrors) { - mutex_exit(&ztest_vdev_lock); (void) close(fd); goto out; } @@ -6301,15 +6653,25 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) "can't inject bad word at 0x%"PRIx64" in %s", offset, pathrand); - mutex_exit(&ztest_vdev_lock); - if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%"PRIx64"\n", pathrand, offset); + + injected = B_TRUE; } (void) close(fd); out: + mutex_exit(&ztest_vdev_lock); + + if (injected && ztest_opts.zo_raid_do_expand) { + int error = spa_scan(spa, POOL_SCAN_SCRUB); + if (error == 0) { + while (dsl_scan_scrubbing(spa_get_dsl(spa))) + txg_wait_synced(spa_get_dsl(spa), 0); + } + } + umem_free(path0, MAXPATHLEN); umem_free(pathrand, MAXPATHLEN); } @@ -7194,6 +7556,75 @@ ztest_execute(int test, ztest_info_t *zi, uint64_t id) (double)functime / NANOSEC, zi->zi_funcname); } +typedef struct ztest_raidz_expand_io { + uint64_t rzx_id; + uint64_t rzx_amount; + uint64_t rzx_bufsize; + const void *rzx_buffer; + uint64_t rzx_alloc_max; + spa_t *rzx_spa; +} ztest_expand_io_t; + +#undef OD_ARRAY_SIZE +#define OD_ARRAY_SIZE 10 + +/* + * Write a request amount of data to some dataset objects. + * There will be ztest_opts.zo_threads count of these running in parallel. + */ +static __attribute__((noreturn)) void +ztest_rzx_thread(void *arg) +{ + ztest_expand_io_t *info = (ztest_expand_io_t *)arg; + ztest_od_t *od; + int batchsize; + int od_size; + ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; + spa_t *spa = info->rzx_spa; + + od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; + od = umem_alloc(od_size, UMEM_NOFAIL); + batchsize = OD_ARRAY_SIZE; + + /* Create objects to write to */ + for (int b = 0; b < batchsize; b++) { + ztest_od_init(od + b, info->rzx_id, FTAG, b, + DMU_OT_UINT64_OTHER, 0, 0, 0); + } + if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { + umem_free(od, od_size); + thread_exit(); + } + + for (uint64_t offset = 0, written = 0; written < info->rzx_amount; + offset += info->rzx_bufsize) { + /* write to 10 objects */ + for (int i = 0; i < batchsize && written < info->rzx_amount; + i++) { + (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); + ztest_write(zd, od[i].od_object, offset, + info->rzx_bufsize, info->rzx_buffer); + (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); + written += info->rzx_bufsize; + } + txg_wait_synced(spa_get_dsl(spa), 0); + /* due to inflation, we'll typically bail here */ + if (metaslab_class_get_alloc(spa_normal_class(spa)) > + info->rzx_alloc_max) { + break; + } + } + + /* Remove a few objects to leave some holes in allocation space */ + mutex_enter(&zd->zd_dirobj_lock); + (void) ztest_remove(zd, od, 2); + mutex_exit(&zd->zd_dirobj_lock); + + umem_free(od, od_size); + + thread_exit(); +} + static __attribute__((noreturn)) void ztest_thread(void *arg) { @@ -7209,8 +7640,10 @@ ztest_thread(void *arg) /* * See if it's time to force a crash. */ - if (now > zs->zs_thread_kill) + if (now > zs->zs_thread_kill && + raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { ztest_kill(zs); + } /* * If we're getting ENOSPC with some regularity, stop. 
@@ -7400,9 +7833,14 @@ ztest_freeze(void) spa_t *spa; int numloops = 0; + /* freeze not supported during RAIDZ expansion */ + if (ztest_opts.zo_raid_do_expand) + return; + if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY0(ztest_dataset_open(0)); @@ -7470,6 +7908,7 @@ ztest_freeze(void) /* * Open and close the pool and dataset to induce log replay. */ + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); @@ -7519,6 +7958,7 @@ ztest_import(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); ztest_import_impl(); @@ -7543,7 +7983,291 @@ ztest_import(ztest_shared_t *zs) } /* - * Kick off threads to run tests on all datasets in parallel. + * After the expansion was killed, check that the pool is healthy + */ +static void +ztest_raidz_expand_check(spa_t *spa) +{ + ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); + /* + * Set pool check done flag, main program will run a zdb check + * of the pool when we exit. + */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; + + /* Wait for reflow to finish */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("\nwaiting for reflow to finish ...\n"); + } + pool_raidz_expand_stat_t rzx_stats; + pool_raidz_expand_stat_t *pres = &rzx_stats; + do { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 500); /* wait 1/2 second */ + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } while (pres->pres_state != DSS_FINISHED && + pres->pres_reflowed < pres->pres_to_reflow); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("verifying an interrupted raidz " + "expansion using a pool scrub ...\n"); + } + /* Will fail here if there is non-recoverable corruption detected */ + VERIFY0(ztest_scrub_impl(spa)); + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expansion scrub check complete\n"); + } +} + +/* + * Start a raidz expansion test. We run some I/O on the pool for a while + * to get some data in the pool. Then we grow the raidz and + * kill the test at the requested offset into the reflow, verifying that + * doing such does not lead to pool corruption. 
+ */ +static void +ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) +{ + nvlist_t *root; + pool_raidz_expand_stat_t rzx_stats; + pool_raidz_expand_stat_t *pres = &rzx_stats; + kthread_t **run_threads; + vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; + int total_disks = rzvd->vdev_children; + int data_disks = total_disks - vdev_get_nparity(rzvd); + uint64_t alloc_goal; + uint64_t csize; + int error, t; + int threads = ztest_opts.zo_threads; + ztest_expand_io_t *thread_args; + + ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); + ASSERT3U(rzvd->vdev_ops, ==, &vdev_raidz_ops); + ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; + + /* Setup a 1 MiB buffer of random data */ + uint64_t bufsize = 1024 * 1024; + void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); + + if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { + fatal(B_TRUE, "short read from /dev/urandom"); + } + /* + * Put some data in the pool and then attach a vdev to initiate + * reflow. + */ + run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); + thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), + UMEM_NOFAIL); + /* Aim for roughly 25% of allocatable space up to 1GB */ + alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; + alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); + if (ztest_opts.zo_verbose >= 1) { + (void) printf("adding data to pool '%s', goal %llu bytes\n", + ztest_opts.zo_pool, (u_longlong_t)alloc_goal); + } + + /* + * Kick off all the I/O generators that run in parallel. + */ + for (t = 0; t < threads; t++) { + if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { + umem_free(run_threads, threads * sizeof (kthread_t *)); + umem_free(buffer, bufsize); + return; + } + thread_args[t].rzx_id = t; + thread_args[t].rzx_amount = alloc_goal / threads; + thread_args[t].rzx_bufsize = bufsize; + thread_args[t].rzx_buffer = buffer; + thread_args[t].rzx_alloc_max = alloc_goal; + thread_args[t].rzx_spa = spa; + run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, + &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); + } + + /* + * Wait for all of the writers to complete. + */ + for (t = 0; t < threads; t++) + VERIFY0(thread_join(run_threads[t])); + + /* + * Close all datasets. This must be done after all the threads + * are joined so we can be sure none of the datasets are in-use + * by any of the threads. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(buffer, bufsize); + umem_free(run_threads, threads * sizeof (kthread_t *)); + umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); + + /* Set our reflow target to 25%, 50% or 75% of allocated size */ + uint_t multiple = ztest_random(3) + 1; + uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; + raidz_expand_max_reflow_bytes = reflow_max; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("running raidz expansion test, killing when " + "reflow reaches %llu bytes (%u/4 of allocated space)\n", + (u_longlong_t)reflow_max, multiple); + } + + /* XXX - do we want some I/O load during the reflow? 
*/ + + /* + * Use a disk size that is larger than existing ones + */ + cvd = rzvd->vdev_child[0]; + csize = vdev_get_min_asize(cvd); + csize += csize / 10; + /* + * Path to vdev to be attached + */ + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); + /* + * Build the nvlist describing newpath. + */ + root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), + NULL, 0, 0, 1); + /* + * Expand the raidz vdev by attaching the new disk + */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", + (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, + newpath); + } + error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); + nvlist_free(root); + if (error != 0) { + fatal(0, "raidz expand: attach (%s %llu) returned %d", + newpath, (long long)csize, error); + } + + /* + * Wait for reflow to begin + */ + while (spa->spa_raidz_expand == NULL) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + } + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + while (pres->pres_state != DSS_SCANNING) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + ASSERT3U(pres->pres_state, ==, DSS_SCANNING); + ASSERT3U(pres->pres_to_reflow, !=, 0); + /* + * Set so when we are killed we go to raidz checking rather than + * restarting test. + */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expansion reflow started, waiting for " + "%llu bytes to be copied\n", (u_longlong_t)reflow_max); + } + + /* + * Wait for reflow maximum to be reached and then kill the test + */ + while (pres->pres_reflowed < reflow_max) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + /* Reset the reflow pause before killing */ + raidz_expand_max_reflow_bytes = 0; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("killing raidz expansion test after reflow " + "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); + } + + /* + * Kill ourself to simulate a panic during a reflow. Our parent will + * restart the test and the changed flag value will drive the test + * through the scrub/check code to verify the pool is not corrupted. + */ + ztest_kill(zs); +} + +static void +ztest_generic_run(ztest_shared_t *zs, spa_t *spa) +{ + kthread_t **run_threads; + int t; + + run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), + UMEM_NOFAIL); + + /* + * Kick off all the tests that run in parallel. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { + umem_free(run_threads, ztest_opts.zo_threads * + sizeof (kthread_t *)); + return; + } + + run_threads[t] = thread_create(NULL, 0, ztest_thread, + (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); + } + + /* + * Wait for all of the tests to complete. 
+ */ + for (t = 0; t < ztest_opts.zo_threads; t++) + VERIFY0(thread_join(run_threads[t])); + + /* + * Close all datasets. This must be done after all the threads + * are joined so we can be sure none of the datasets are in-use + * by any of the threads. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); +} + +/* + * Setup our test context and kick off threads to run tests on all datasets + * in parallel. */ static void ztest_run(ztest_shared_t *zs) @@ -7551,7 +8275,6 @@ ztest_run(ztest_shared_t *zs) spa_t *spa; objset_t *os; kthread_t *resume_thread, *deadman_thread; - kthread_t **run_threads; uint64_t object; int error; int t, d; @@ -7584,6 +8307,7 @@ ztest_run(ztest_shared_t *zs) * Open our pool. It may need to be imported first depending on * what tests were running when the previous pass was terminated. */ + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); error = spa_open(ztest_opts.zo_pool, &spa, FTAG); if (error) { @@ -7597,7 +8321,11 @@ ztest_run(ztest_shared_t *zs) metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; - VERIFY0(vdev_raidz_impl_set("cycle")); + /* + * XXX - BUGBUG raidz expansion do not run this for generic for now + */ + if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) + VERIFY0(vdev_raidz_impl_set("cycle")); dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, @@ -7607,6 +8335,10 @@ ztest_run(ztest_shared_t *zs) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); dmu_objset_disown(os, B_TRUE, FTAG); + /* Give the dedicated raidz expansion test more grace time */ + if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) + zfs_deadman_synctime_ms *= 2; + /* * Create a thread to periodically resume suspended I/O. */ @@ -7640,6 +8372,10 @@ ztest_run(ztest_shared_t *zs) * If we got any ENOSPC errors on the previous run, destroy something. */ if (zs->zs_enospc_count != 0) { + /* Not expecting ENOSPC errors during raidz expansion tests */ + ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, + RAIDZ_EXPAND_NONE); + int d = ztest_random(ztest_opts.zo_datasets); ztest_dataset_destroy(d); } @@ -7654,9 +8390,12 @@ ztest_run(ztest_shared_t *zs) * that we always run the scrub whenever an indirect vdev exists * because we have no way of knowing for sure if ztest_device_removal() * fully completed its scrub before the pool was reimported. + * + * Does not apply for the RAIDZ expansion specific test runs */ - if (spa->spa_removing_phys.sr_state == DSS_SCANNING || - spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && + (spa->spa_removing_phys.sr_state == DSS_SCANNING || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); @@ -7666,9 +8405,6 @@ ztest_run(ztest_shared_t *zs) ASSERT0(error); } - run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), - UMEM_NOFAIL); - if (ztest_opts.zo_verbose >= 4) (void) printf("starting main threads...\n"); @@ -7681,43 +8417,12 @@ ztest_run(ztest_shared_t *zs) (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, NULL, DS_FIND_CHILDREN); - /* - * Kick off all the tests that run in parallel. 
- */ - for (t = 0; t < ztest_opts.zo_threads; t++) { - if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { - umem_free(run_threads, ztest_opts.zo_threads * - sizeof (kthread_t *)); - return; - } - - run_threads[t] = thread_create(NULL, 0, ztest_thread, - (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, - defclsyspri); - } - - /* - * Wait for all of the tests to complete. - */ - for (t = 0; t < ztest_opts.zo_threads; t++) - VERIFY0(thread_join(run_threads[t])); - - /* - * Close all datasets. This must be done after all the threads - * are joined so we can be sure none of the datasets are in-use - * by any of the threads. - */ - for (t = 0; t < ztest_opts.zo_threads; t++) { - if (t < ztest_opts.zo_datasets) - ztest_dataset_close(t); - } - - txg_wait_synced(spa_get_dsl(spa), 0); - - zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); - zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); - - umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); + if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) + ztest_raidz_expand_run(zs, spa); + else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) + ztest_raidz_expand_check(spa); + else + ztest_generic_run(zs, spa); /* Kill the resume and deadman threads */ ztest_exiting = B_TRUE; @@ -7826,6 +8531,7 @@ ztest_init(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* @@ -7911,6 +8617,7 @@ shared_data_size(ztest_shared_hdr_t *hdr) size += hdr->zh_size; size += hdr->zh_stats_size * hdr->zh_stats_count; size += hdr->zh_ds_size * hdr->zh_ds_count; + size += hdr->zh_scratch_state_size; return (size); } @@ -7934,6 +8641,7 @@ setup_hdr(void) hdr->zh_stats_count = ZTEST_FUNCS; hdr->zh_ds_size = sizeof (ztest_shared_ds_t); hdr->zh_ds_count = ztest_opts.zo_datasets; + hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); size = shared_data_size(hdr); VERIFY0(ftruncate(ztest_fd_data, size)); @@ -7968,6 +8676,8 @@ setup_data(void) ztest_shared_callstate = (void *)&buf[offset]; offset += hdr->zh_stats_size * hdr->zh_stats_count; ztest_shared_ds = (void *)&buf[offset]; + offset += hdr->zh_ds_size * hdr->zh_ds_count; + ztest_scratch_state = (void *)&buf[offset]; } static boolean_t @@ -8188,13 +8898,14 @@ main(int argc, char **argv) hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); if (ztest_opts.zo_verbose >= 1) { - (void) printf("%"PRIu64" vdevs, %d datasets, %d threads," - "%d %s disks, %"PRIu64" seconds...\n\n", + (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " + "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, ztest_opts.zo_raid_children, ztest_opts.zo_raid_type, + ztest_opts.zo_raid_parity, ztest_opts.zo_time); } @@ -8307,6 +9018,9 @@ main(int argc, char **argv) if (!ztest_opts.zo_mmp_test) ztest_run_zdb(zs->zs_guid); + if (ztest_shared_opts->zo_raidz_expand_test == + RAIDZ_EXPAND_CHECKED) + break; /* raidz expand test complete */ } if (ztest_opts.zo_verbose >= 1) { |