author     Martin Matuska <mm@FreeBSD.org>    2023-11-09 10:42:33 +0000
committer  Martin Matuska <mm@FreeBSD.org>    2023-11-09 12:19:17 +0000
commit     e716630d4cf89e69ec3f675ebfceee09f1a85e05 (patch)
tree       3ee825a5671f470e1481d24312b58895a12d01ac /sys/contrib/openzfs/module/zfs
parent     f5b3e686292b6502878c64c3c154908024e06eb6 (diff)
parent     887a3c533b94a4b70075e310f15c45b9dee19410 (diff)
zfs: merge openzfs/zfs@887a3c533
Notable upstream pull request merges:
#15022 5caeef02f RAID-Z expansion feature
#15457 887a3c533 Increase L2ARC write rate and headroom
#15504 1c1be60fa Unbreak FreeBSD world build after 3bd4df384
Obtained from: OpenZFS
OpenZFS commit: 887a3c533b94a4b70075e310f15c45b9dee19410
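
For orientation before the diffstat: the heart of the RAID-Z expansion work merged here (#15022) is a reflow that re-spreads already-allocated sectors across the widened child set using modulo arithmetic on the top-level vdev's sector index. The standalone sketch below is illustrative only — the helper name, struct, and demo layout are invented here — but the mapping it prints (child = sector % width, offset = (sector / width) << ashift) mirrors the per-row calculation visible later in the vdev_raidz.c hunk.

/*
 * Illustrative sketch only (not part of the patch): sector_to_child() and
 * the 4-wide -> 5-wide demo are invented here to show the modulo mapping
 * the reflow applies per row: child = sector % width,
 * offset = (sector / width) << ashift.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

struct loc {
	uint64_t child;		/* index of the child (leaf) vdev */
	uint64_t offset;	/* byte offset on that child */
};

static struct loc
sector_to_child(uint64_t sector, uint64_t width, uint64_t ashift)
{
	struct loc l;

	l.child = sector % width;
	l.offset = (sector / width) << ashift;
	return (l);
}

int
main(void)
{
	const uint64_t ashift = 12;	/* 4 KiB sectors */

	/* Old (4-wide) vs. new (5-wide) location of the first few sectors. */
	for (uint64_t s = 0; s < 10; s++) {
		struct loc o = sector_to_child(s, 4, ashift);
		struct loc n = sector_to_child(s, 5, ashift);

		printf("sector %2" PRIu64 ": old child %" PRIu64 "@%-6" PRIu64
		    " new child %" PRIu64 "@%" PRIu64 "\n",
		    s, o.child, o.offset, n.child, n.offset);
	}
	return (0);
}
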
Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
-rw-r--r--  sys/contrib/openzfs/module/zfs/arc.c              |   12
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_scan.c         |    1
-rw-r--r--  sys/contrib/openzfs/module/zfs/metaslab.c         |   12
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c              |  240
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_checkpoint.c   |    3
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c             |  114
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_draid.c       |   28
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_initialize.c  |   12
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_label.c       |   51
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz.c       | 2556
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_trim.c        |   17
11 files changed, 2817 insertions, 229 deletions
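
The first hunk below (arc.c) quadruples both L2ARC feed constants. As a rough back-of-the-envelope check — assuming the conventional relationship in which the headroom multiplier scales the per-pass write size to bound how far ahead of the L2ARC hand the ARC lists are scanned, which this diff itself does not spell out — the scan window grows from 16 MiB to 256 MiB per pass:

/*
 * Rough arithmetic only; the "write size x headroom" scan-window relation
 * is an assumption about how these two constants are conventionally
 * combined, not something this diff states.
 */
#include <stdio.h>

int
main(void)
{
	const unsigned long long old_write = 8ULL << 20;	/* 8 MiB */
	const unsigned long long old_headroom = 2;
	const unsigned long long new_write = 32ULL << 20;	/* 32 MiB */
	const unsigned long long new_headroom = 8;

	printf("old scan window per pass: %llu MiB\n",
	    (old_write * old_headroom) >> 20);		/* 16 MiB */
	printf("new scan window per pass: %llu MiB\n",
	    (new_write * new_headroom) >> 20);		/* 256 MiB */
	return (0);
}
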
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index dfea15b74394..2d08cc5e7240 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -776,8 +776,8 @@ uint64_t zfs_crc64_table[256]; * Level 2 ARC */ -#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 8 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost @@ -4518,7 +4518,7 @@ arc_evict_cb_check(void *arg, zthr_t *zthr) static void arc_evict_cb(void *arg, zthr_t *zthr) { - (void) arg, (void) zthr; + (void) arg; uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); @@ -4542,9 +4542,13 @@ arc_evict_cb(void *arg, zthr_t *zthr) * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. + * + * Note we cancel using zthr instead of arc_evict_zthr + * because the latter may not yet be initializd when the + * callback is first invoked. */ mutex_enter(&arc_evict_lock); - arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && + arc_evict_needed = !zthr_iscancelled(zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 34012db82dee..e16128fdff87 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -3066,7 +3066,6 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index e0d4a6a63508..0983ba143a1d 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -4342,7 +4342,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || + vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } @@ -4650,6 +4651,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); + VERIFY0(msp->ms_new); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -4721,10 +4723,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, } /* - * If the selected metaslab is condensing or disabled, - * skip it. + * If the selected metaslab is condensing or disabled, or + * hasn't gone through a metaslab_sync_done(), then skip it. 
*/ - if (msp->ms_condensing || msp->ms_disabled > 0) + if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) continue; *was_active = msp->ms_allocator != -1; @@ -5270,7 +5272,7 @@ top: ASSERT(mg->mg_class == mc); - uint64_t asize = vdev_psize_to_asize(vd, psize); + uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 68f367c1c744..20225640f8c5 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -63,6 +63,7 @@ #include <sys/vdev_rebuild.h> #include <sys/vdev_trim.h> #include <sys/vdev_disk.h> +#include <sys/vdev_raidz.h> #include <sys/vdev_draid.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -1709,6 +1710,10 @@ spa_destroy_aux_threads(spa_t *spa) zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } + if (spa->spa_raidz_expand_zthr != NULL) { + zthr_destroy(spa->spa_raidz_expand_zthr); + spa->spa_raidz_expand_zthr = NULL; + } } /* @@ -1861,6 +1866,8 @@ spa_unload(spa_t *spa) spa->spa_compatibility = NULL; } + spa->spa_raidz_expand = NULL; + spa_config_exit(spa, SCL_ALL, spa); } @@ -2999,6 +3006,7 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); @@ -3753,6 +3761,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != 0) { + spa_load_note(spa, "uberblock raidz_reflow_info: " + "state=%u offset=%llu", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + } /* @@ -5092,6 +5106,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* + * Before we do any zio_write's, complete the raidz expansion + * scratch space copying, if necessary. + */ + if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) + vdev_raidz_reflow_copy_scratch(spa); + + /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ @@ -6905,9 +6926,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } /* - * Attach a device to a mirror. The arguments are the path to any device - * in the mirror, and the nvroot for the new device. If the path specifies - * a device that is not mirrored, we automatically insert the mirror vdev. + * Attach a device to a vdev specified by its guid. The vdev type can be + * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a + * single device). When the vdev is a single device, a mirror vdev will be + * automatically inserted. 
* * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own @@ -6930,7 +6952,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; - int newvd_isspare; + int newvd_isspare = B_FALSE; int error; ASSERT(spa_writeable(spa)); @@ -6961,16 +6983,35 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, ZFS_ERR_REBUILD_IN_PROGRESS)); } - if (spa->spa_vdev_removal != NULL) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (spa->spa_vdev_removal != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_DEVRM_IN_PROGRESS)); + } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (!oldvd->vdev_ops->vdev_op_leaf) + boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; + + if (raidz) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * Can't expand a raidz while prior expand is in progress. + */ + if (spa->spa_raidz_expand != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + } + } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } - pvd = oldvd->vdev_parent; + if (raidz) + pvd = oldvd; + else + pvd = oldvd->vdev_parent; if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH) != 0) @@ -7026,6 +7067,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_raidz_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -7065,7 +7107,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. */ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; + if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -7076,31 +7119,74 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* + * RAIDZ-expansion-specific checks. + */ + if (raidz) { + if (vdev_raidz_attach_check(newvd) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * Fail early if a child is not healthy or being replaced + */ + for (int i = 0; i < oldvd->vdev_children; i++) { + if (vdev_is_dead(oldvd->vdev_child[i]) || + !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, newrootvd, txg, + ENXIO)); + } + /* Also fail if reserved boot area is in-use */ + if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) + != 0) { + return (spa_vdev_exit(spa, newrootvd, txg, + EADDRINUSE)); + } + } + } + + if (raidz) { + /* + * Note: oldvdpath is freed by spa_strfree(), but + * kmem_asprintf() is freed by kmem_strfree(), so we have to + * move it to a spa_strdup-ed string. + */ + char *tmp = kmem_asprintf("raidz%u-%u", + (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); + oldvdpath = spa_strdup(tmp); + kmem_strfree(tmp); + } else { + oldvdpath = spa_strdup(oldvd->vdev_path); + } + newvdpath = spa_strdup(newvd->vdev_path); + + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. 
*/ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); - (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, - "%s/%s", newvd->vdev_path, "old"); + (void) sprintf(oldvd->vdev_path, "%s/old", + newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } + spa_strfree(oldvdpath); + oldvdpath = spa_strdup(oldvd->vdev_path); } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ - if (pvd->vdev_ops != pvops) + if (!raidz && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + } ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. @@ -7128,41 +7214,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, dtl_max_txg - TXG_INITIAL); + if (raidz) { + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } + vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_wait(tvd); - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; + dtl_max_txg = spa_vdev_config_enter(spa); - /* - * Mark newvd's DTL dirty in this txg. - */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); + tvd->vdev_rz_expanding = B_TRUE; - /* - * Schedule the resilver or rebuild to restart in the future. We do - * this to ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. - */ - if (rebuild) { - newvd->vdev_rebuild_txg = txg; + vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); + vdev_config_dirty(tvd); - vdev_rebuild(tvd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + dtl_max_txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, + newvd, tx); + dmu_tx_commit(tx); } else { - newvd->vdev_resilver_txg = txg; + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); + + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } + + newvd_isspare = newvd->vdev_isspare; + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { - vdev_defer_resilver(newvd); + /* + * Schedule the resilver or rebuild to restart in the future. + * We do this to ensure that dmu_sync-ed blocks have been + * stitched into the respective datasets. 
+ */ + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); } else { - dsl_scan_restart_resilver(spa->spa_dsl_pool, - dtl_max_txg); + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, + SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } } } @@ -7487,7 +7598,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || - vd->vdev_top->vdev_removing)) { + vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && @@ -7609,7 +7720,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && - (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { + (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && @@ -8512,6 +8624,10 @@ spa_async_suspend(spa_t *spa) if (condense_thread != NULL) zthr_cancel(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_cancel(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); @@ -8538,6 +8654,10 @@ spa_async_resume(spa_t *spa) if (condense_thread != NULL) zthr_resume(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_resume(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); @@ -9343,6 +9463,27 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) != NULL) vdev_sync(vd, txg); + if (pass == 1) { + /* + * dsl_pool_sync() -> dp_sync_tasks may have dirtied + * the config. If that happens, this txg should not + * be a no-op. So we must sync the config to the MOS + * before checking for no-op. + * + * Note that when the config is dirty, it will + * be written to the MOS (i.e. the MOS will be + * dirtied) every time we call spa_sync_config_object() + * in this txg. Therefore we can't call this after + * dsl_pool_sync() every pass, because it would + * prevent us from converging, since we'd dirty + * the MOS every pass. + * + * Sync tasks can only be processed in pass 1, so + * there's no need to do this in later passes. 
+ */ + spa_sync_config_object(spa, tx); + } + /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock @@ -10100,7 +10241,8 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: - if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + *in_progress = vdev_rebuild_active(spa->spa_root_vdev); + if (*in_progress) break; zfs_fallthrough; case ZPOOL_WAIT_SCRUB: @@ -10115,6 +10257,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } + case ZPOOL_WAIT_RAIDZ_EXPAND: + { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); + break; + } default: panic("unrecognized value for activity %d", activity); } diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c index b588f7041e5c..1efff47f87a0 100644 --- a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c +++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c @@ -465,6 +465,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + if (spa->spa_raidz_expand != NULL) + return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index afb01c0ef7fd..c10c78ebf6db 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -58,6 +58,7 @@ #include <sys/abd.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> +#include <sys/vdev_raidz.h> #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> #include "zfs_prop.h" @@ -305,13 +306,13 @@ vdev_derive_alloc_bias(const char *bias) * all children. This is what's used by anything other than RAID-Z. */ uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) +vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } @@ -930,6 +931,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); + vd->vdev_rz_expanding = nvlist_exists(nv, + ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } @@ -1692,6 +1695,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; + vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", + vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { @@ -1913,17 +1918,20 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) } /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. + * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) + * because it is the "typical" blocksize. 
Even though SPA_MAXBLOCKSIZE + * changed, this algorithm can not change, otherwise it would inconsistently + * account for existing bp's. We also hard-code txg 0 for the same reason + * since expanded RAIDZ vdevs can use a different asize for different birth + * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> + SPA_MINBLOCKSHIFT); } } @@ -3228,32 +3236,43 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; + } else { + mutex_enter(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) { + /* leaf vdevs only */ + continue; + } + if (t == DTL_PARTIAL) { + /* i.e. non-zero */ + minref = 1; + } else if (vdev_get_nparity(vd) != 0) { + /* RAIDZ, DRAID */ + minref = vdev_get_nparity(vd) + 1; + } else { + /* any kind of mirror */ + minref = vd->vdev_children; + } + space_reftree_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_reftree_generate_map(&reftree, + vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); + } + mutex_exit(&vd->vdev_dtl_lock); } - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. non-zero */ - else if (vdev_get_nparity(vd) != 0) - minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { + raidz_dtl_reassessed(vd); } - mutex_exit(&vd->vdev_dtl_lock); } /* @@ -3628,6 +3647,12 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + if (vd->vdev_ops == &vdev_raidz_ops) { + error = vdev_raidz_load(vd); + if (error != 0) + return (error); + } + /* * On spa_load path, grab the allocation bias from our zap */ @@ -4005,10 +4030,22 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Return the amount of space that should be (or was) allocated for the given + * psize (compressed block size) in the given TXG. Note that for expanded + * RAIDZ vdevs, the size allocated for older BP's may be larger. See + * vdev_raidz_asize(). 
+ */ +uint64_t +vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); +} + uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { - return (vd->vdev_ops->vdev_op_asize(vd, psize)); + return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* @@ -4174,9 +4211,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; @@ -5457,7 +5491,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + if ((vd->vdev_spa->spa_raidz_expand == NULL || + vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && + (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); @@ -6209,6 +6245,14 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; + case VDEV_PROP_RAIDZ_EXPANDING: + /* Only expose this for raidz */ + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_prop_add_list(outnvl, propname, + NULL, vd->vdev_rz_expanding, + ZPROP_SRC_NONE); + } + continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index 307e2353d020..ec961255fd64 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -577,8 +577,9 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc, * i.e. vdev_draid_psize_to_asize(). 
*/ static uint64_t -vdev_draid_asize(vdev_t *vd, uint64_t psize) +vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { + (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_ashift; @@ -960,7 +961,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t io_size = abd_size; - uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t io_asize = vdev_draid_asize(vd, io_size, 0); uint64_t group = vdev_draid_offset_to_group(vd, io_offset); uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); @@ -1025,15 +1026,9 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, ASSERT3U(vdc->vdc_nparity, >, 0); - raidz_row_t *rr; - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); - rr->rr_cols = groupwidth; - rr->rr_scols = groupwidth; + raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth); rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = vdc->vdc_nparity; - rr->rr_abd_empty = NULL; #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; @@ -1053,14 +1048,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); rc->rc_offset = physical_offset; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (q == 0 && i >= bc) rc->rc_size = 0; @@ -1129,7 +1116,7 @@ vdev_draid_map_alloc(zio_t *zio) if (size < abd_size) { vdev_t *vd = zio->io_vd; - io_offset += vdev_draid_asize(vd, size); + io_offset += vdev_draid_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; @@ -1151,7 +1138,6 @@ vdev_draid_map_alloc(zio_t *zio) rm->rm_row[0] = rr[0]; if (nrows == 2) rm->rm_row[1] = rr[1]; - return (rm); } @@ -1783,7 +1769,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = vdev_draid_asize(vd, psize); + uint64_t asize = vdev_draid_asize(vd, psize, 0); if (phys_birth == TXG_UNKNOWN) { /* @@ -1840,7 +1826,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_draid_asize(vd, rr->rr_size); + vdev_draid_asize(vd, rr->rr_size, 0); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index ffdcef1972c3..5aaef1a69986 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -48,7 +48,8 @@ static boolean_t vdev_initialize_should_stop(vdev_t *vd) { return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } static void @@ -67,7 +68,8 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + 
!vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; @@ -631,6 +633,7 @@ vdev_initialize(vdev_t *vd) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, @@ -791,13 +794,14 @@ vdev_initialize_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_action_time = timestamp; - if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_initialize_load(vd)); } else if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_initialize_thread == NULL) { vdev_initialize(vd); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index a2e5524a8391..e8f562a1a6a2 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -142,6 +142,7 @@ #include <sys/zap.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> #include <sys/vdev_draid.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> @@ -423,6 +424,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } + + pool_raidz_expand_stat_t pres; + if (spa_raidz_expand_get_stats(spa, &pres) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, + sizeof (pres) / sizeof (uint64_t)); + } } static void @@ -1504,7 +1512,8 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) } struct ubl_cbdata { - uberblock_t *ubl_ubbest; /* Best uberblock */ + uberblock_t ubl_latest; /* Most recent uberblock */ + uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */ vdev_t *ubl_vd; /* vdev associated with the above */ }; @@ -1521,6 +1530,9 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); + if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) { + cbp->ubl_latest = *ub; + } if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* @@ -1578,10 +1590,10 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) ASSERT(config); memset(ub, 0, sizeof (uberblock_t)); + memset(&cb, 0, sizeof (cb)); *config = NULL; cb.ubl_ubbest = ub; - cb.ubl_vd = NULL; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); @@ -1598,6 +1610,22 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. 
" "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != + cb.ubl_latest.ub_raidz_reflow_info) { + vdev_dbgmsg(cb.ubl_vd, + "spa=%s best uberblock (txg=%llu info=0x%llx) " + "has different raidz_reflow_info than latest " + "uberblock (txg=%llu info=0x%llx)", + spa->spa_name, + (u_longlong_t)ub->ub_txg, + (u_longlong_t)ub->ub_raidz_reflow_info, + (u_longlong_t)cb.ubl_latest.ub_txg, + (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); + memset(ub, 0, sizeof (uberblock_t)); + spa_config_exit(spa, SCL_ALL, FTAG); + return; + } + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " @@ -1719,8 +1747,23 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, vd->vdev_copy_uberblocks = B_FALSE; } + /* + * We chose a slot based on the txg. If this uberblock has a special + * RAIDZ expansion state, then it is essentially an update of the + * current uberblock (it has the same txg). However, the current + * state is committed, so we want to write it to a different slot. If + * we overwrote the same slot, and we lose power during the uberblock + * write, and the disk does not do single-sector overwrites + * atomically (even though it is required to - i.e. we should see + * either the old or the new uberblock), then we could lose this + * txg's uberblock. Rewinding to the previous txg's uberblock may not + * be possible because RAIDZ expansion may have already overwritten + * some of the data, so we need the progress indicator in the + * uberblock. + */ int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0; - int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); + int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) % + (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1737,7 +1780,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, } /* Sync the uberblocks to all vdevs in svd[] */ -static int +int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 3445fa9d35d5..9d0b8763f16f 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -27,15 +27,22 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zap.h> #include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> +#include <sys/dmu_tx.h> #include <sys/abd.h> +#include <sys/zfs_rlock.h> #include <sys/fs/zfs.h> #include <sys/fm/fs/zfs.h> #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> #include <sys/vdev_draid.h> +#include <sys/uberblock_impl.h> +#include <sys/dsl_scan.h> #ifdef ZFS_DEBUG #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -135,6 +142,237 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } + +/* + * Big Theory Statement for how a RAIDZ VDEV is expanded + * + * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion + * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs + * that have been previously expanded can be expanded again. + * + * The RAIDZ VDEV must be healthy (must be able to write to all the drives in + * the VDEV) when an expansion starts. 
And the expansion will pause if any + * disk in the VDEV fails, and resume once the VDEV is healthy again. All other + * operations on the pool can continue while an expansion is in progress (e.g. + * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, + * and zpool initialize which can't be run during an expansion. Following a + * reboot or export/import, the expansion resumes where it left off. + * + * == Reflowing the Data == + * + * The expansion involves reflowing (copying) the data from the current set + * of disks to spread it across the new set which now has one more disk. This + * reflow operation is similar to reflowing text when the column width of a + * text editor window is expanded. The text doesn’t change but the location of + * the text changes to accommodate the new width. An example reflow result for + * a 4-wide RAIDZ1 to a 5-wide is shown below. + * + * Reflow End State + * Each letter indicates a parity group (logical stripe) + * + * Before expansion After Expansion + * D1 D2 D3 D4 D1 D2 D3 D4 D5 + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | A | A | A | A | | A | A | A | A | B | + * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | B | B | C | C | | B | C | C | C | C | + * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | C | C | D | D | | D | D | E | E | E | + * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | E | E | E | E | --> | E | F | F | G | G | + * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | F | F | G | G | | G | G | H | H | H | + * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | G | G | H | H | | H | I | I | J | J | + * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | H | H | I | I | | J | J | | | K | + * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| + * +------+------+------+------+ +------+------+------+------+------+ + * + * This reflow approach has several advantages. There is no need to read or + * modify the block pointers or recompute any block checksums. The reflow + * doesn’t need to know where the parity sectors reside. We can read and write + * data sequentially and the copy can occur in a background thread in open + * context. The design also allows for fast discovery of what data to copy. + * + * The VDEV metaslabs are processed, one at a time, to copy the block data to + * have it flow across all the disks. The metaslab is disabled for allocations + * during the copy. As an optimization, we only copy the allocated data which + * can be determined by looking at the metaslab range tree. During the copy we + * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still + * need to be able to survive losing parity count disks). This means we + * cannot overwrite data during the reflow that would be needed if a disk is + * lost. + * + * After the reflow completes, all newly-written blocks will have the new + * layout, i.e., they will have the parity to data ratio implied by the new + * number of disks in the RAIDZ group. 
Even though the reflow copies all of + * the allocated space (data and parity), it is only rearranged, not changed. + * + * This act of reflowing the data has a few implications about blocks + * that were written before the reflow completes: + * + * - Old blocks will still use the same amount of space (i.e., they will have + * the parity to data ratio implied by the old number of disks in the RAIDZ + * group). + * - Reading old blocks will be slightly slower than before the reflow, for + * two reasons. First, we will have to read from all disks in the RAIDZ + * VDEV, rather than being able to skip the children that contain only + * parity of this block (because the data of a single block is now spread + * out across all the disks). Second, in most cases there will be an extra + * bcopy, needed to rearrange the data back to its original layout in memory. + * + * == Scratch Area == + * + * As we copy the block data, we can only progress to the point that writes + * will not overlap with blocks whose progress has not yet been recorded on + * disk. Since partially-copied rows are always read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent any + * row-wise overlap. For example, in the diagram above, when we reflow sector + * B6 it will overwite the original location for B5. + * + * To get around this, a scratch space is used so that we can start copying + * without risking data loss by overlapping the row. As an added benefit, it + * improves performance at the beginning of the reflow, but that small perf + * boost wouldn't be worth the complexity on its own. + * + * Ideally we want to copy at least 2 * (new_width)^2 so that we have a + * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max + * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice + * the widths will likely be single digits so we can get a substantial chuck + * size using only a few MB of scratch per disk. + * + * The scratch area is persisted to disk which holds a large amount of reflowed + * state. We can always read the partially written stripes when a disk fails or + * the copy is interrupted (crash) during the initial copying phase and also + * get past a small chunk size restriction. At a minimum, the scratch space + * must be large enough to get us to the point that one row does not overlap + * itself when moved (i.e new_width^2). But going larger is even better. We + * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels + * as our scratch space to handle overwriting the initial part of the VDEV. + * + * 0 256K 512K 4M + * +------+------+-----------------------+----------------------------- + * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... + * | L0 | L1 | Reserved | (Metaslabs) + * +------+------+-----------------------+------------------------------- + * Scratch Area + * + * == Reflow Progress Updates == + * After the initial scratch-based reflow, the expansion process works + * similarly to device removal. We create a new open context thread which + * reflows the data, and periodically kicks off sync tasks to update logical + * state. In this case, state is the committed progress (offset of next data + * to copy). We need to persist the completed offset on disk, so that if we + * crash we know which format each VDEV offset is in. + * + * == Time Dependent Geometry == + * + * In non-expanded RAIDZ, blocks are read from disk in a column by column + * fashion. 
For a multi-row block, the second sector is in the first column + * not in the second column. This allows us to issue full reads for each + * column directly into the request buffer. The block data is thus laid out + * sequentially in a column-by-column fashion. + * + * For example, in the before expansion diagram above, one logical block might + * be sectors G19-H26. The parity is in G19,H23; and the data is in + * G20,H24,G21,H25,G22,H26. + * + * After a block is reflowed, the sectors that were all in the original column + * data can now reside in different columns. When reading from an expanded + * VDEV, we need to know the logical stripe width for each block so we can + * reconstitute the block’s data after the reads are completed. Likewise, + * when we perform the combinatorial reconstruction we need to know the + * original width so we can retry combinations from the past layouts. + * + * Time dependent geometry is what we call having blocks with different layouts + * (stripe widths) in the same VDEV. This time-dependent geometry uses the + * block’s birth time (+ the time expansion ended) to establish the correct + * width for a given block. After an expansion completes, we record the time + * for blocks written with a particular width (geometry). + * + * == On Disk Format Changes == + * + * New pool feature flag, 'raidz_expansion' whose reference count is the number + * of RAIDZ VDEVs that have been expanded. + * + * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. + * + * Since the uberblock can point to arbitrary blocks, which might be on the + * expanding RAIDZ, and might or might not have been expanded. We need to know + * which way a block is laid out before reading it. This info is the next + * offset that needs to be reflowed and we persist that in the uberblock, in + * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. + * After the expansion is complete, we then use the raidz_expand_txgs array + * (see below) to determine how to read a block and the ub_raidz_reflow_info + * field no longer required. + * + * The uberblock's ub_raidz_reflow_info field also holds the scratch space + * state (i.e., active or not) which is also required before reading a block + * during the initial phase of reflowing the data. + * + * The top-level RAIDZ VDEV has two new entries in the nvlist: + * + * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here + * and used after the expansion is complete to + * determine how to read a raidz block + * 'raidz_expanding' boolean: present during reflow and removed after completion + * used during a spa import to resume an unfinished + * expansion + * + * And finally the VDEVs top zap adds the following informational entries: + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE + * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED + */ + +/* + * For testing only: pause the raidz expansion after reflowing this amount. + * (accessed by ZTS and ztest) + */ +#ifdef _KERNEL +static +#endif /* _KERNEL */ +unsigned long raidz_expand_max_reflow_bytes = 0; + +/* + * For testing only: pause the raidz expansion at a certain point. + */ +uint_t raidz_expand_pause_point = 0; + +/* + * Maximum amount of copy io's outstanding at once. + */ +static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; + +/* + * Apply raidz map abds aggregation if the number of rows in the map is equal + * or greater than the value below. 
+ */ +static unsigned long raidz_io_aggregate_rows = 4; + +/* + * Automatically start a pool scrub when a RAIDZ expansion completes in + * order to verify the checksums of all blocks which have been copied + * during the expansion. Automatic scrubbing is enabled by default and + * is strongly recommended. + */ +static int zfs_scrub_after_expand = 1; + static void vdev_raidz_row_free(raidz_row_t *rr) { @@ -159,6 +397,17 @@ vdev_raidz_map_free(raidz_map_t *rm) for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); + if (rm->rm_nphys_cols) { + for (int i = 0; i < rm->rm_nphys_cols; i++) { + if (rm->rm_phys_col[i].rc_abd != NULL) + abd_free(rm->rm_phys_col[i].rc_abd); + } + + kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * + rm->rm_nphys_cols); + } + + ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } @@ -170,10 +419,37 @@ vdev_raidz_map_free_vsd(zio_t *zio) vdev_raidz_map_free(rm); } +static int +vdev_raidz_reflow_compare(const void *x1, const void *x2) +{ + const reflow_node_t *l = x1; + const reflow_node_t *r = x2; + + return (TREE_CMP(l->re_txg, r->re_txg)); +} + const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; +raidz_row_t * +vdev_raidz_row_alloc(int cols) +{ + raidz_row_t *rr = + kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); + + rr->rr_cols = cols; + rr->rr_scols = cols; + + for (int c = 0; c < cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_shadow_devidx = INT_MAX; + rc->rc_shadow_offset = UINT64_MAX; + rc->rc_allow_repair = 1; + } + return (rr); +} + static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { @@ -302,7 +578,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; - uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t acols, scols; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); @@ -312,22 +588,22 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ - q = s / (dcols - nparity); + uint64_t q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ - r = s - q * (dcols - nparity); + uint64_t r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); + uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* * acols: The columns that will be accessed. 
@@ -343,43 +619,28 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } ASSERT3U(acols, <=, scols); - - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rr = vdev_raidz_row_alloc(scols); rm->rm_row[0] = rr; - rr->rr_cols = acols; - rr->rr_scols = scols; rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif - asize = 0; + uint64_t asize = 0; - for (c = 0; c < scols; c++) { + for (uint64_t c = 0; c < scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - col = f + c; - coff = o; + uint64_t col = f + c; + uint64_t coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rc->rc_devidx = col; rc->rc_offset = coff; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (c >= acols) rc->rc_size = 0; @@ -419,13 +680,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rr->rr_col[0].rc_devidx; + uint64_t devidx = rr->rr_col[0].rc_devidx; o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; - if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } @@ -435,7 +695,338 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } else { vdev_raidz_map_alloc_read(zio, rm); } + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + +/* + * Everything before reflow_offset_synced should have been moved to the new + * location (read and write completed). However, this may not yet be reflected + * in the on-disk format (e.g. raidz_reflow_sync() has been called but the + * uberblock has not yet been written). If reflow is not in progress, + * reflow_offset_synced should be UINT64_MAX. For each row, if the row is + * entirely before reflow_offset_synced, it will come from the new location. + * Otherwise this row will come from the old location. Therefore, rows that + * straddle the reflow_offset_synced will come from the old location. + * + * For writes, reflow_offset_next is the next offset to copy. If a sector has + * been copied, but not yet reflected in the on-disk progress + * (reflow_offset_synced), it will also be written to the new (already copied) + * offset. + */ +noinline raidz_map_t * +vdev_raidz_map_alloc_expanded(zio_t *zio, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset_synced, + uint64_t reflow_offset_next, boolean_t use_scratch) +{ + abd_t *abd = zio->io_abd; + uint64_t offset = zio->io_offset; + uint64_t size = zio->io_size; + + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + uint64_t q = s / (logical_cols - nparity); + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. 
+ */ + uint64_t r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + uint64_t bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + rm->rm_skipstart = bc; + uint64_t asize = 0; + + for (uint64_t row = 0; row < rows; row++) { + boolean_t row_use_scratch = B_FALSE; + raidz_row_t *rr = vdev_raidz_row_alloc(cols); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. */ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and the copying has + * not yet completed for any part of this row, then use the + * old location of this row. Note that reflow_offset_synced + * reflects the i/o that's been completed, because it's + * updated by a synctask, after zio_wait(spa_txg_zio[]). + * This is sufficient for our check, even if that progress + * has not yet been recorded to disk (reflected in + * spa_ubsync). Also note that we consider the last row to + * be "full width" (`cols`-wide rather than `bc`-wide) for + * this calculation. This causes a tiny bit of unnecessary + * double-writes but is safe and simpler to calculate. + */ + int row_phys_cols = physical_cols; + if (b + cols > reflow_offset_synced >> ashift) + row_phys_cols--; + else if (use_scratch) + row_use_scratch = B_TRUE; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * Note, rr_cols is the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". + */ + rr->rr_firstdatacol = nparity; +#ifdef ZFS_DEBUG + /* + * note: rr_size is PSIZE, not ASIZE + */ + rr->rr_offset = b << ashift; + rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; +#endif + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_devidx = child_id; + rc->rc_offset = child_offset; + + /* + * Get this from the scratch space if appropriate. + * This only happens if we crashed in the middle of + * raidz_reflow_scratch_sync() (while it's running, + * the rangelock prevents us from doing concurrent + * io), and even then only during zpool import or + * when the pool is imported readonly. 
+ */ + if (row_use_scratch) + rc->rc_offset -= VDEV_BOOT_SIZE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rc->rc_size = 1ULL << ashift; + + /* + * Parity sectors' rc_abd's are set below + * after determining if this is an aggregation. + */ + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end of the block (even including + * skip sectors). This sector is part of the + * map so that we have full rows for p/q parity + * generation. + */ + rc->rc_size = 0; + rc->rc_abd = NULL; + } else { + /* "data column" (col excluding parity) */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + rc->rc_size = 1ULL << ashift; + rc->rc_abd = abd_get_offset_struct( + &rc->rc_abdstruct, abd, off << ashift, + rc->rc_size); + } + + if (rc->rc_size == 0) + continue; + + /* + * If any part of this row is in both old and new + * locations, the primary location is the old + * location. If this sector was already copied to the + * new location, we need to also write to the new, + * "shadow" location. + * + * Note, `row_phys_cols != physical_cols` indicates + * that the primary location is the old location. + * `b+c < reflow_offset_next` indicates that the copy + * to the new location has been initiated. We know + * that the copy has completed because we have the + * rangelock, which is held exclusively while the + * copy is in progress. + */ + if (row_use_scratch || + (row_phys_cols != physical_cols && + b + c < reflow_offset_next >> ashift)) { + rc->rc_shadow_devidx = (b + c) % physical_cols; + rc->rc_shadow_offset = + ((b + c) / physical_cols) << ashift; + if (row_use_scratch) + rc->rc_shadow_offset -= VDEV_BOOT_SIZE; + } + + asize += rc->rc_size; + } + + /* + * See comment in vdev_raidz_map_alloc() + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + + int devidx0 = rr->rr_col[0].rc_devidx; + uint64_t offset0 = rr->rr_col[0].rc_offset; + int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; + uint64_t shadow_offset0 = + rr->rr_col[0].rc_shadow_offset; + + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[0].rc_shadow_devidx = + rr->rr_col[1].rc_shadow_devidx; + rr->rr_col[0].rc_shadow_offset = + rr->rr_col[1].rc_shadow_offset; + + rr->rr_col[1].rc_devidx = devidx0; + rr->rr_col[1].rc_offset = offset0; + rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; + rr->rr_col[1].rc_shadow_offset = shadow_offset0; + } + } + ASSERT3U(asize, ==, tot << ashift); + + /* + * Determine if the block is contiguous, in which case we can use + * an aggregation. + */ + if (rows >= raidz_io_aggregate_rows) { + rm->rm_nphys_cols = physical_cols; + rm->rm_phys_col = + kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, + KM_SLEEP); + + /* + * Determine the aggregate io's offset and size, and check + * that the io is contiguous. 
+ */ + for (int i = 0; + i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + + if (rc->rc_size == 0) + continue; + + if (prc->rc_size == 0) { + ASSERT0(prc->rc_offset); + prc->rc_offset = rc->rc_offset; + } else if (prc->rc_offset + prc->rc_size != + rc->rc_offset) { + /* + * This block is not contiguous and + * therefore can't be aggregated. + * This is expected to be rare, so + * the cost of allocating and then + * freeing rm_phys_col is not + * significant. + */ + kmem_free(rm->rm_phys_col, + sizeof (raidz_col_t) * + rm->rm_nphys_cols); + rm->rm_phys_col = NULL; + rm->rm_nphys_cols = 0; + break; + } + prc->rc_size += rc->rc_size; + } + } + } + if (rm->rm_phys_col != NULL) { + /* + * Allocate aggregate ABD's. + */ + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + + prc->rc_devidx = i; + + if (prc->rc_size == 0) + continue; + + prc->rc_abd = + abd_alloc_linear(rm->rm_phys_col[i].rc_size, + B_FALSE); + } + + /* + * Point the parity abd's into the aggregate abd's. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_abd = + abd_get_offset_struct(&rc->rc_abdstruct, + prc->rc_abd, + rc->rc_offset - prc->rc_offset, + rc->rc_size); + } + } + } else { + /* + * Allocate new abd's for the parity sectors. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = + abd_alloc_linear(rc->rc_size, + B_TRUE); + } + } + } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -453,11 +1044,11 @@ vdev_raidz_p_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && !pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++) + for (int i = 0; i < cnt; i++, src++, pqr->p++) *pqr->p ^= *src; return (0); @@ -469,11 +1060,11 @@ vdev_raidz_pq_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -488,11 +1079,11 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -618,7 +1209,15 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { - ASSERT3U(rr->rr_cols, !=, 0); + if (rr->rr_cols == 0) { + /* + * We are handling this block one row at a time (because + * this block has a different logical vs physical width, + * 
due to RAIDZ expansion), and this is a pad-only row, + * which has no parity. + */ + return; + } /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) @@ -770,6 +1369,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) int x = tgts[0]; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); + ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); @@ -802,6 +1404,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) int c, exp; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); + ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); @@ -848,6 +1453,9 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) int y = tgts[1]; abd_t *xd, *yd; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); @@ -1295,11 +1903,14 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) abd_t **bufs = NULL; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs if any non-linear ABDs are found. */ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + ASSERT(rr->rr_col[i].rc_abd != NULL); if (!abd_is_linear(rr->rr_col[i].rc_abd)) { bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); @@ -1427,10 +2038,23 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", + rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, + (int)rr->rr_missingparity); + } + nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " + "offset=%llx error=%u)", + rr, c, (int)rr->rr_col[c].rc_devidx, + (long long)rr->rr_col[c].rc_offset, + (int)rr->rr_col[c].rc_error); + } if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; @@ -1537,8 +2161,15 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *physical_ashift, cvd->vdev_physical_ashift); } - *asize *= vd->vdev_children; - *max_asize *= vd->vdev_children; + if (vd->vdev_rz_expanding) { + *asize *= vd->vdev_children - 1; + *max_asize *= vd->vdev_children - 1; + + vd->vdev_min_asize = *asize; + } else { + *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; + } if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; @@ -1557,19 +2188,71 @@ vdev_raidz_close(vdev_t *vd) } } +/* + * Return the logical width to use, given the txg in which the allocation + * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the + * BP was allocated. Remapped BP's (that were relocated due to device + * removal, see remap_blkptr_cb()), will have a more recent + * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can + * ignore these because they can't be on RAIDZ (device removal doesn't + * support RAIDZ). 
+ */ +static uint64_t +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) +{ + reflow_node_t lookup = { + .re_txg = txg, + }; + avl_index_t where; + + uint64_t width; + mutex_enter(&vdrz->vd_expand_lock); + reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); + if (re != NULL) { + width = re->re_logical_width; + } else { + re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); + if (re != NULL) + width = re->re_logical_width; + else + width = vdrz->vd_original_width; + } + mutex_exit(&vdrz->vd_expand_lock); + return (width); +} + +/* + * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated + * more space due to the lower data-to-parity ratio. In this case it's + * important to pass in the correct txg. Note that vdev_gang_header_asize() + * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, + * regardless of txg. This is assured because for a single data sector, we + * allocate P+1 sectors regardless of width ("cols", which is at least P+1). + */ static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) +vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_logical_width; + uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; + cols = vdev_raidz_get_logical_width(vdrz, txg); + asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; +#ifdef ZFS_DEBUG + uint64_t asize_new = ((psize - 1) >> ashift) + 1; + uint64_t ncols_new = vdrz->vd_physical_width; + asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / + (ncols_new - nparity)); + asize_new = roundup(asize_new, nparity + 1) << ashift; + VERIFY3U(asize_new, <=, asize); +#endif + return (asize); } @@ -1596,21 +2279,37 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +vdev_raidz_shadow_child_done(zio_t *zio) { -#ifdef ZFS_DEBUG - vdev_t *tvd = vd->vdev_top; + raidz_col_t *rc = zio->io_private; + + rc->rc_shadow_error = zio->io_error; +} +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) +{ + (void) rm; +#ifdef ZFS_DEBUG range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(vd, rr->rr_size); + vdev_raidz_asize(zio->io_vd, rr->rr_size, + BP_PHYSICAL_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; - vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); + if (vdev_xlate_is_empty(&physical_rs)) { + /* + * If we are in the middle of expansion, the + * physical->logical mapping is changing so vdev_xlate() + * can't give us a reliable answer. 
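As a rough numeric companion to the allocation-size math in vdev_raidz_asize() above (an older, narrower logical width allocates more space for the same psize), here is a minimal sketch with roundup() expanded inline; the function name, sample sizes, and main() are ours.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: mirrors the arithmetic in vdev_raidz_asize() above,
 * with roundup() spelled out.
 */
static uint64_t
example_raidz_asize(uint64_t psize, uint64_t ashift, uint64_t cols,
    uint64_t nparity)
{
	uint64_t asize = ((psize - 1) >> ashift) + 1;	/* data sectors */
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1);
	return (asize << ashift);
}

int
main(void)
{
	/* 40 KiB block, 4 KiB sectors, RAIDZ1: width 4 before vs. 6 after */
	printf("width 4: %llu bytes\n",
	    (unsigned long long)example_raidz_asize(40 << 10, 12, 4, 1));
	printf("width 6: %llu bytes\n",
	    (unsigned long long)example_raidz_asize(40 << 10, 12, 6, 1));
	return (0);
}

The 10 data sectors cost 14 allocated sectors (57344 bytes) at width 4 but only 12 (49152 bytes) at width 6, which is why the txg-dependent logical width matters here.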
+ */ + return; + } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1621,7 +2320,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + - rc->rc_size + (1 << tvd->vdev_ashift)); + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } @@ -1629,7 +2328,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) } static void -vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; @@ -1641,31 +2340,66 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ - vdev_raidz_io_verify(vd, rr, c); + vdev_raidz_io_verify(zio, rm, rr, c); - if (rc->rc_size > 0) { - ASSERT3P(rc->rc_abd, !=, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, - abd_get_size(rc->rc_abd), zio->io_type, - zio->io_priority, 0, vdev_raidz_child_done, rc)); - } else { - /* - * Generate optional write for skip sector to improve - * aggregation contiguity. - */ - ASSERT3P(rc->rc_abd, ==, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, NULL, 1ULL << ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, - NULL)); + if (rc->rc_size == 0) + continue; + + ASSERT3U(rc->rc_offset + rc->rc_size, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + ASSERT3P(rc->rc_abd, !=, NULL); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), zio->io_type, + zio->io_priority, 0, vdev_raidz_child_done, rc)); + + if (rc->rc_shadow_devidx != INT_MAX) { + vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + + ASSERT3U( + rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, + cvd2->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, + rc->rc_shadow_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), + zio->io_type, zio->io_priority, 0, + vdev_raidz_shadow_child_done, rc)); } } } +/* + * Generate optional I/Os for skip sectors to improve aggregation contiguity. + * This only works for vdev_raidz_map_alloc() (not _expanded()). 
+ */ +static void +raidz_start_skip_writes(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + raidz_map_t *rm = zio->io_vsd; + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + for (int c = 0; c < rr->rr_scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (rc->rc_size != 0) + continue; + ASSERT3P(rc->rc_abd, ==, NULL); + + ASSERT3U(rc->rc_offset, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, + NULL, 1ULL << ashift, zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } +} + static void -vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; @@ -1697,7 +2431,8 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_skipped = 1; continue; } - if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + if (forceparity || + c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -1707,6 +2442,56 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) } } +static void +vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) +{ + vdev_t *vd = zio->io_vd; + + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + if (prc->rc_size == 0) + continue; + + ASSERT3U(prc->rc_devidx, ==, i); + vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { + prc->rc_error = SET_ERROR(ENXIO); + prc->rc_tried = 1; /* don't even try */ + prc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + prc->rc_error = SET_ERROR(ESTALE); + prc->rc_skipped = 1; + continue; + } + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + prc->rc_offset, prc->rc_abd, prc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, prc)); + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) +{ + /* + * If there are multiple rows, we will be hitting + * all disks, so go ahead and read the parity so + * that we are reading in decent size chunks. + */ + boolean_t forceparity = rm->rm_nrows > 1; + + if (rm->rm_phys_col) { + vdev_raidz_io_start_read_phys_cols(zio, rm); + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_start_read_row(zio, rr, forceparity); + } + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1730,24 +2515,83 @@ vdev_raidz_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, + BP_PHYSICAL_BIRTH(zio->io_bp)); + if (logical_width != vdrz->vd_physical_width) { + zfs_locked_range_t *lr = NULL; + uint64_t synced_offset = UINT64_MAX; + uint64_t next_offset = UINT64_MAX; + boolean_t use_scratch = B_FALSE; + /* + * Note: when the expansion is completing, we set + * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) + * in a later txg than when we last update spa_ubsync's state + * (see the end of spa_raidz_expand_thread()). Therefore we + * may see vre_state!=SCANNING before + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected + * on disk, but the copying progress has been synced to disk + * (and reflected in spa_ubsync). 
In this case it's fine to + * treat the expansion as completed, since if we crash there's + * no additional copying to do. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, + &vdrz->vn_vre); + lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, + zio->io_offset, zio->io_size, RL_READER); + use_scratch = + (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == + RRSS_SCRATCH_VALID); + synced_offset = + RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); + next_offset = vdrz->vn_vre.vre_offset; + /* + * If we haven't resumed expanding since importing the + * pool, vre_offset won't have been set yet. In + * this case the next offset to be copied is the same + * as what was synced. + */ + if (next_offset == UINT64_MAX) { + next_offset = synced_offset; + } + } + if (use_scratch) { + zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" + "%lld next_offset=%lld use_scratch=%u", + zio, + zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", + (long long)zio->io_offset, + (long long)synced_offset, + (long long)next_offset, + use_scratch); + } + + rm = vdev_raidz_map_alloc_expanded(zio, + tvd->vdev_ashift, vdrz->vd_physical_width, + logical_width, vdrz->vd_nparity, + synced_offset, next_offset, use_scratch); + rm->rm_lr = lr; + } else { + rm = vdev_raidz_map_alloc(zio, + tvd->vdev_ashift, logical_width, vdrz->vd_nparity); + } + rm->rm_original_width = vdrz->vd_original_width; - raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, - vdrz->vd_logical_width, vdrz->vd_nparity); zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; - - /* - * Until raidz expansion is implemented all maps for a raidz vdev - * contain a single row. - */ - ASSERT3U(rm->rm_nrows, ==, 1); - raidz_row_t *rr = rm->rm_row[0]; - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_write(zio, rm->rm_row[i]); + } + + if (logical_width == vdrz->vd_physical_width) { + raidz_start_skip_writes(zio); + } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); - vdev_raidz_io_start_read(zio, rr); + vdev_raidz_io_start_read(zio, rm); } zio_execute(zio); @@ -1847,6 +2691,8 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { + zfs_dbgmsg("found error on col=%u devidx=%u off %llx", + c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1862,8 +2708,10 @@ vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rr->rr_cols; c++) + for (int c = 0; c < rr->rr_cols; c++) { error = zio_worst_error(error, rr->rr_col[c].rc_error); + error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); + } return (error); } @@ -1929,6 +2777,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) continue; } + zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " + "offset=%llx", + zio, c, rc->rc_devidx, (long long)rc->rc_offset); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, @@ -1938,6 +2790,42 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + /* + * Scrub or resilver i/o's: overwrite any shadow locations with the + * good data. This ensures that if we've already copied this sector, + * it will be corrected if it was damaged. 
This writes more than is + * necessary, but since expansion is paused during scrub/resilver, at + * most a single row will have a shadow location. + */ + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + + if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; + + /* + * Note: We don't want to update the repair stats + * because that would incorrectly indicate that there + * was bad data to repair, which we aren't sure about. + * By clearing the SCAN_THREAD flag, we prevent this + * from happening, despite having the REPAIR flag set. + * We need to set SELF_HEAL so that this i/o can't be + * bypassed by zio_vdev_io_start(). + */ + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, + rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL); + cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; + zio_nowait(cio); + } + } } static void @@ -1957,6 +2845,43 @@ raidz_restore_orig_data(raidz_map_t *rm) } /* + * During raidz_reconstruct() for expanded VDEV, we need special consideration + * failure simulations. See note in raidz_reconstruct() on simulating failure + * of a pre-expansion device. + * + * Treating logical child i as failed, return TRUE if the given column should + * be treated as failed. The idea of logical children allows us to imagine + * that a disk silently failed before a RAIDZ expansion (reads from this disk + * succeed but return the wrong data). Since the expansion doesn't verify + * checksums, the incorrect data will be moved to new locations spread among + * the children (going diagonally across them). + * + * Higher "logical child failures" (values of `i`) indicate these + * "pre-expansion failures". The first physical_width values imagine that a + * current child failed; the next physical_width-1 values imagine that a + * child failed before the most recent expansion; the next physical_width-2 + * values imagine a child failed in the expansion before that, etc. + */ +static boolean_t +raidz_simulate_failure(int physical_width, int original_width, int ashift, + int i, raidz_col_t *rc) +{ + uint64_t sector_id = + physical_width * (rc->rc_offset >> ashift) + + rc->rc_devidx; + + for (int w = physical_width; w >= original_width; w--) { + if (i < w) { + return (sector_id % w == i); + } else { + i -= w; + } + } + ASSERT(!"invalid logical child id"); + return (B_FALSE); +} + +/* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed * returns 0 on successful reconstruction @@ -1965,6 +2890,15 @@ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? 
+ rm->rm_original_width : physical_width; + int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; + + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " + "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); + } /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { @@ -1974,6 +2908,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) int dead = 0; int dead_data = 0; + if (dbgmsg) + zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); @@ -1986,7 +2923,10 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { - if (rc->rc_devidx == ltgts[lt]) { + if (raidz_simulate_failure(physical_width, + original_width, + zio->io_vd->vdev_top->vdev_ashift, + ltgts[lt], rc)) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = abd_alloc_linear( @@ -1999,13 +2939,37 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) dead++; if (c >= nparity) dead_data++; - my_tgts[t++] = c; + /* + * Note: simulating failure of a + * pre-expansion device can hit more + * than one column, in which case we + * might try to simulate more failures + * than can be reconstructed, which is + * also more than the size of my_tgts. + * This check prevents accessing past + * the end of my_tgts. The "dead > + * nparity" check below will fail this + * reconstruction attempt. + */ + if (t < VDEV_RAIDZ_MAXPARITY) { + my_tgts[t++] = c; + if (dbgmsg) { + zfs_dbgmsg("simulating " + "failure of col %u " + "devidx %u", c, + (int)rc->rc_devidx); + } + } break; } } } if (dead > nparity) { /* reconstruction not possible */ + if (dbgmsg) { + zfs_dbgmsg("reconstruction not possible; " + "too many failures"); + } raidz_restore_orig_data(rm); return (EINVAL); } @@ -2049,11 +3013,19 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) zio_checksum_verified(zio); + if (dbgmsg) { + zfs_dbgmsg("reconstruction successful " + "(checksum verified)"); + } return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " + "failed", zio); + } return (ECKSUM); } @@ -2068,7 +3040,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) - * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - Try to increment this slot (tgts[i] += 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * @@ -2099,18 +3071,22 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * * This strategy works for dRAID but is less efficient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, - * since the raidz_map_t rows likely do not overlap reconstruction would be + * since the raidz_map_t rows likely do not overlap, reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. + * + * Returns 0 on success, ECKSUM on failure. 
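To make the slot-increment walk described above concrete, here is a small standalone sketch (ours; array sizing and printing are simplified) that enumerates combinations of logical child IDs in the same carry-like order, using the same sentinel trick as the tstore[] array in vdev_raidz_combrec().

#include <stdio.h>

#define	EX_MAXFAILURES	3

/*
 * Illustrative only: prints every combination of num_failures distinct
 * logical child IDs out of n, advancing slots the same way the
 * reconstruction code does (increment a slot; on collision with the
 * next slot, reset to the minimum and carry).
 */
static void
example_combinations(int n, int num_failures)
{
	int tstore[EX_MAXFAILURES + 2];
	int *ltgts = &tstore[1];

	tstore[0] = -1;			/* sentinel below slot 0 */
	for (int i = 0; i < num_failures; i++)
		ltgts[i] = i;
	ltgts[num_failures] = n;	/* sentinel above the last slot */

	for (;;) {
		for (int i = 0; i < num_failures; i++)
			printf("%d%c", ltgts[i],
			    i == num_failures - 1 ? '\n' : ' ');

		for (int t = 0; ; t++) {
			ltgts[t]++;
			if (ltgts[t] == n)	/* last slot ran off the end */
				break;
			if (ltgts[t] != ltgts[t + 1])
				break;		/* found next combination */
			/* collision: reset to minimum and carry */
			ltgts[t] = ltgts[t - 1] + 1;
		}
		if (ltgts[num_failures - 1] == n)
			break;			/* tried all combinations */
	}
}

int
main(void)
{
	example_combinations(5, 2);
	return (0);
}

For n = 5 and two simulated failures this prints the ten pairs from "0 1" through "3 4", i.e. all C(5,2) combinations, none repeated.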
*/ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? + rm->rm_original_width : physical_width; - /* Check if there's enough data to attempt reconstrution. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; @@ -2128,8 +3104,16 @@ vdev_raidz_combrec(zio_t *zio) int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ - /* Determine number of logical children, n */ - int n = zio->io_vd->vdev_children; + + /* + * Determine number of logical children, n. See comment + * above raidz_simulate_failure(). + */ + int n = 0; + for (int w = physical_width; + w >= original_width; w--) { + n += w; + } ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); @@ -2160,6 +3144,14 @@ vdev_raidz_combrec(zio_t *zio) if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); + if (zfs_flags & + ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruction " + "failed for num_failures=" + "%u; tried all " + "combinations", + num_failures); + } break; } @@ -2171,7 +3163,7 @@ vdev_raidz_combrec(zio_t *zio) * Try the next combination. */ if (ltgts[t] != ltgts[t + 1]) - break; + break; // found next combination /* * Otherwise, reset this tgt to the minimum, @@ -2186,7 +3178,8 @@ vdev_raidz_combrec(zio_t *zio) break; } } - + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruction failed for all num_failures"); return (ECKSUM); } @@ -2211,7 +3204,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { - int total_errors = 0; + int normal_errors = 0; + int shadow_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); @@ -2220,24 +3214,31 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { + if (rc->rc_error != 0) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - total_errors++; + normal_errors++; + } + if (rc->rc_shadow_error != 0) { + ASSERT(rc->rc_shadow_error != ECKSUM); + shadow_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough - * columns to reconstruct the data, the I/O failed. Otherwise, - * good enough. + * columns to reconstruct the data, the I/O failed. Otherwise, good + * enough. Note that in the case of a shadow write (during raidz + * expansion), depending on if we crash, either the normal (old) or + * shadow (new) location may become the "real" version of the block, + * so both locations must have sufficient redundancy. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. 
*/ - if (total_errors > rr->rr_firstdatacol) { + if (normal_errors > rr->rr_firstdatacol || + shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } @@ -2254,7 +3255,6 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -2337,7 +3337,7 @@ vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. */ - if (rr->rr_nempty && rr->rr_abd_empty == NULL) + if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { @@ -2395,11 +3395,48 @@ vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; + ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { + if (rm->rm_phys_col) { + /* + * This is an aggregated read. Copy the data and status + * from the aggregate abd's to the individual rows. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) + continue; + + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_error = prc->rc_error; + rc->rc_tried = prc->rc_tried; + rc->rc_skipped = prc->rc_skipped; + if (c >= rr->rr_firstdatacol) { + /* + * Note: this is slightly faster + * than using abd_copy_off(). + */ + char *physbuf = abd_to_buf( + prc->rc_abd); + void *physloc = physbuf + + rc->rc_offset - + prc->rc_offset; + + abd_copy_from_buf(rc->rc_abd, + physloc, rc->rc_size); + } + } + } + } + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, @@ -2446,7 +3483,54 @@ vdev_raidz_io_done(zio_t *zio) zio_vdev_io_redone(zio); return; } - + /* + * It would be too expensive to try every possible + * combination of failed sectors in every row, so + * instead we try every combination of failed current or + * past physical disk. This means that if the incorrect + * sectors were all on Nparity disks at any point in the + * past, we will find the correct data. The only known + * case where this is less durable than a non-expanded + * RAIDZ, is if we have a silent failure during + * expansion. In that case, one block could be + * partially in the old format and partially in the + * new format, so we'd lost some sectors from the old + * format and some from the new format. + * + * e.g. logical_width=4 physical_width=6 + * the 15 (6+5+4) possible failed disks are: + * width=6 child=0 + * width=6 child=1 + * width=6 child=2 + * width=6 child=3 + * width=6 child=4 + * width=6 child=5 + * width=5 child=0 + * width=5 child=1 + * width=5 child=2 + * width=5 child=3 + * width=5 child=4 + * width=4 child=0 + * width=4 child=1 + * width=4 child=2 + * width=4 child=3 + * And we will try every combination of Nparity of these + * failing. + * + * As a first pass, we can generate every combo, + * and try reconstructing, ignoring any known + * failures. 
If any row has too many known + simulated + * failures, then we bail on reconstructing with this + * number of simulated failures. As an improvement, + * we could detect the number of whole known failures + * (i.e. we have known failures on these disks for + * every row; the disks never succeeded), and + * subtract that from the max # failures to simulate. + * We could go even further like the current + * combrec code, but that doesn't seem like it + * gains us very much. If we simulate a failure + * that is also a known failure, that's fine. + */ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2454,6 +3538,10 @@ vdev_raidz_io_done(zio_t *zio) } } } + if (rm->rm_lr != NULL) { + zfs_rangelock_exit(rm->rm_lr); + rm->rm_lr = NULL; + } } static void @@ -2480,6 +3568,14 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * If we're in the middle of a RAIDZ expansion, this block may be in + * the old and/or new location. For simplicity, always resilver it. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) + return (B_TRUE); + uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; @@ -2524,7 +3620,24 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); - uint64_t width = raidvd->vdev_children; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + /* + * We're in the middle of expansion, in which case the + * translation is in flux. Any answer we give may be wrong + * by the time we return, so it isn't safe for the caller to + * act on it. Therefore we say that this range isn't present + * on any children. The only consumers of this are "zpool + * initialize" and trimming, both of which are "best effort" + * anyway. + */ + physical_rs->rs_start = physical_rs->rs_end = 0; + remain_rs->rs_start = remain_rs->rs_end = 0; + return; + } + + uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; @@ -2550,15 +3663,1155 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, logical_rs->rs_end - logical_rs->rs_start); } +static void +raidz_reflow_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + /* + * Ensure there are no i/os to the range that is being committed. + */ + uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); + + mutex_enter(&vre->vre_lock); + uint64_t new_offset = + MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); + /* + * We should not have committed anything that failed. + */ + VERIFY3U(vre->vre_failed_offset, >=, old_offset); + mutex_exit(&vre->vre_lock); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + old_offset, new_offset - old_offset, + RL_WRITER); + + /* + * Update the uberblock that will be written when this txg completes. 
+ */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, + RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); + vre->vre_offset_pertxg[txgoff] = 0; + zfs_rangelock_exit(lr); + + mutex_enter(&vre->vre_lock); + vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; + vre->vre_bytes_copied_pertxg[txgoff] = 0; + mutex_exit(&vre->vre_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); +} + +static void +raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + for (int i = 0; i < TXG_SIZE; i++) + VERIFY0(vre->vre_offset_pertxg[i]); + + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; + re->re_logical_width = vdrz->vd_physical_width; + mutex_enter(&vdrz->vd_expand_lock); + avl_add(&vdrz->vd_expand_txgs, re); + mutex_exit(&vdrz->vd_expand_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + + /* + * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS + * will get written (based on vd_expand_txgs). + */ + vdev_config_dirty(vd); + + /* + * Before we change vre_state, the on-disk state must reflect that we + * have completed all copying, so that vdev_raidz_io_start() can use + * vre_state to determine if the reflow is in progress. See also the + * end of spa_raidz_expand_thread(). + */ + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, + raidvd->vdev_ms_count << raidvd->vdev_ms_shift); + + vre->vre_end_time = gethrestime_sec(); + vre->vre_state = DSS_FINISHED; + + uint64_t state = vre->vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t end_time = vre->vre_end_time; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time, tx)); + + spa->spa_uberblock.ub_raidz_reflow_info = 0; + + spa_history_log_internal(spa, "raidz vdev expansion completed", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)vd->vdev_id, + (unsigned long long)vd->vdev_children); + + spa->spa_raidz_expand = NULL; + raidvd->vdev_rz_expanding = B_FALSE; + + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + + spa_notify_waiters(spa); + + /* + * While we're in syncing context take the opportunity to + * setup a scrub. All the data has been sucessfully copied + * but we have not validated any checksums. + */ + pool_scan_func_t func = POOL_SCAN_SCRUB; + if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) + dsl_scan_setup_sync(&func, tx); +} + +/* + * Struct for one copy zio. + */ +typedef struct raidz_reflow_arg { + vdev_raidz_expand_t *rra_vre; + zfs_locked_range_t *rra_lr; + uint64_t rra_txg; +} raidz_reflow_arg_t; + +/* + * The write of the new location is done. 
+ */ +static void +raidz_reflow_write_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + abd_free(zio->io_abd); + + mutex_enter(&vre->vre_lock); + if (zio->io_error != 0) { + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + } + ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); + vre->vre_outstanding_bytes -= zio->io_size; + if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < + vre->vre_failed_offset) { + vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += + zio->io_size; + } + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + zfs_rangelock_exit(rra->rra_lr); + + kmem_free(rra, sizeof (*rra)); + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ +static void +raidz_reflow_read_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + /* + * If the read failed, or if it was done on a vdev that is not fully + * healthy (e.g. a child that has a resilver in progress), we may not + * have the correct data. Note that it's OK if the write proceeds. + * It may write garbage but the location is otherwise unused and we + * will retry later due to vre_failed_offset. + */ + if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { + zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " + "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_lr->lr_length, + (long long)rra->rra_txg, + zio->io_error, + vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), + vdev_dtl_empty(zio->io_vd, DTL_MISSING)); + mutex_enter(&vre->vre_lock); + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + mutex_exit(&vre->vre_lock); + } + + zio_nowait(zio_unique_parent(zio)); +} + +static void +raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, + dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (offset == 0) + return; + + mutex_enter(&vre->vre_lock); + ASSERT3U(vre->vre_offset, <=, offset); + vre->vre_offset = offset; + mutex_exit(&vre->vre_lock); + + if (vre->vre_offset_pertxg[txgoff] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, + spa, tx); + } + vre->vre_offset_pertxg[txgoff] = offset; +} + +static boolean_t +vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) +{ + for (int i = 0; i < raidz_vd->vdev_children; i++) { + /* Quick check if a child is being replaced */ + if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, + dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + int ashift = vd->vdev_top->vdev_ashift; + uint64_t offset, size; + + if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, + &offset, &size)) { + return (B_FALSE); + } + ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); + ASSERT3U(size, >=, 1 << ashift); + uint64_t length = 1 << ashift; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + uint64_t blkid = offset >> ashift; + + int old_children = vd->vdev_children - 1; + + /* + * We can only progress to the point that writes will not overlap + * with blocks whose progress has not yet been recorded on disk. 
+ * Since partially-copied rows are still read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent + * row-wise overlap. + * + * Note that even if we are skipping over a large unallocated region, + * we can't move the on-disk progress to `offset`, because concurrent + * writes/allocations could still use the currently-unallocated + * region. + */ + uint64_t ubsync_blkid = + RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; + uint64_t next_overwrite_blkid = ubsync_blkid + + ubsync_blkid / old_children - old_children; + VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); + + if (blkid >= next_overwrite_blkid) { + raidz_reflow_record_progress(vre, + next_overwrite_blkid << ashift, tx); + return (B_TRUE); + } + + range_tree_remove(rt, offset, length); + + raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); + rra->rra_vre = vre; + rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, + offset, length, RL_WRITER); + rra->rra_txg = dmu_tx_get_txg(tx); + + raidz_reflow_record_progress(vre, offset + length, tx); + + mutex_enter(&vre->vre_lock); + vre->vre_outstanding_bytes += length; + mutex_exit(&vre->vre_lock); + + /* + * SCL_STATE will be released when the read and write are done, + * by raidz_reflow_write_done(). + */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* check if a replacing vdev was added, if so treat it as an error */ + if (vdev_raidz_expand_child_replacing(vd)) { + zfs_dbgmsg("replacing vdev encountered, reflow paused at " + "offset=%llu txg=%llu", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_txg); + + mutex_enter(&vre->vre_lock); + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + /* drop everything we acquired */ + zfs_rangelock_exit(rra->rra_lr); + kmem_free(rra, sizeof (*rra)); + spa_config_exit(spa, SCL_STATE, spa); + return (B_TRUE); + } + + zio_t *pio = spa->spa_txg_zio[txgoff]; + abd_t *abd = abd_alloc_for_io(length, B_FALSE); + zio_t *write_zio = zio_vdev_child_io(pio, NULL, + vd->vdev_child[blkid % vd->vdev_children], + (blkid / vd->vdev_children) << ashift, + abd, length, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_write_done, rra); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + vd->vdev_child[blkid % old_children], + (blkid / old_children) << ashift, + abd, length, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_read_done, rra)); + + return (B_FALSE); +} + +/* + * For testing (ztest specific) + */ +static void +raidz_expand_pause(uint_t pause_point) +{ + while (raidz_expand_pause_point != 0 && + raidz_expand_pause_point <= pause_point) + delay(hz); +} + +static void +raidz_scratch_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +/* + * Reflow the beginning portion of the vdev into an intermediate scratch area + * in memory and on disk. This operation must be persisted on disk before we + * proceed to overwrite the beginning portion with the reflowed data. + * + * This multi-step task can fail to complete if disk errors are encountered + * and we can return here after a pause (waiting for disk to become healthy). 
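A minimal standalone sketch of the old-to-new sector mapping used by raidz_reflow_impl() above; the sample ashift, widths, and synced offset are ours.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: a logical sector blkid lives at
 * (blkid % width, blkid / width) both before and after expansion; only
 * the width changes. The last lines reproduce the next_overwrite_blkid
 * bound for a sample synced offset.
 */
int
main(void)
{
	int ashift = 12;		/* 4 KiB sectors (example value) */
	uint64_t old_children = 4;	/* width before attaching one disk */
	uint64_t new_children = old_children + 1;

	for (uint64_t blkid = 0; blkid < 10; blkid++) {
		printf("blkid %llu: old (disk %llu, off 0x%llx) -> "
		    "new (disk %llu, off 0x%llx)\n",
		    (unsigned long long)blkid,
		    (unsigned long long)(blkid % old_children),
		    (unsigned long long)((blkid / old_children) << ashift),
		    (unsigned long long)(blkid % new_children),
		    (unsigned long long)((blkid / new_children) << ashift));
	}

	uint64_t ubsync_blkid = 1000;	/* progress already synced to disk */
	uint64_t next_overwrite_blkid = ubsync_blkid +
	    ubsync_blkid / old_children - old_children;
	printf("copy may proceed up to (not including) blkid %llu\n",
	    (unsigned long long)next_overwrite_blkid);
	return (0);
}

With progress synced through blkid 1000 on a 4-wide (pre-attach) vdev, copying may continue up to blkid 1246 before it would touch rows whose copies have not yet been persisted.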
+ */ +static void +raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) +{ + vdev_raidz_expand_t *vre = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zio_t *pio; + int error; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + int ashift = raidvd->vdev_ashift; + uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift); + uint64_t logical_size = write_size * raidvd->vdev_children; + uint64_t read_size = + P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), + 1 << ashift); + + /* + * The scratch space must be large enough to get us to the point + * that one row does not overlap itself when moved. This is checked + * by vdev_raidz_attach_check(). + */ + VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); + VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); + VERIFY3U(write_size, <=, read_size); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + 0, logical_size, RL_WRITER); + + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(read_size, B_FALSE); + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); + + /* + * If we have already written the scratch area then we must read from + * there, since new writes were redirected there while we were paused + * or the original location may have been partially overwritten with + * reflowed data. + */ + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); + /* + * Read from scratch space. + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE + * to the offset to calculate the physical offset to + * write to. Passing in a negative offset makes us + * access the scratch area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, + raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading scratch location", + error); + goto io_error_exit; + } + goto overwrite; + } + + /* + * Read from original location. + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children - 1; i++) { + ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], read_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading original location", error); +io_error_exit: + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + return; + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); + + /* + * Reflow in memory. 
+ */ + uint64_t logical_sectors = logical_size >> ashift; + for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { + int oldchild = i % (raidvd->vdev_children - 1); + uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; + + int newchild = i % raidvd->vdev_children; + uint64_t newoff = (i / raidvd->vdev_children) << ashift; + + /* a single sector should not be copying over itself */ + ASSERT(!(newchild == oldchild && newoff == oldoff)); + + abd_copy_off(abds[newchild], abds[oldchild], + newoff, oldoff, 1 << ashift); + } + + /* + * Verify that we filled in everything we intended to (write_size on + * each child). + */ + VERIFY0(logical_sectors % raidvd->vdev_children); + VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, + write_size); + + /* + * Write to scratch location (boot area). + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d writing scratch location", error); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", + (long long)logical_size); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); + + /* + * Update uberblock to indicate that scratch space is valid. This is + * needed because after this point, the real location may be + * overwritten. If we crash, we need to get the data from the + * scratch space, rather than the real location. + * + * Note: ub_timestamp is bumped so that vdev_uberblock_compare() + * will prefer this uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); + + /* + * Overwrite with reflow'ed data. + */ +overwrite: + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + /* + * When we exit early here and drop the range lock, new + * writes will go into the scratch area so we'll need to + * read from there when we return after pausing. + */ + zfs_dbgmsg("reflow: error %d writing real location", error); + /* + * Update the uberblock that is written when this txg completes. 
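The in-memory reflow loop above can be illustrated with a small standalone sketch (the sample ashift, width, and sector count are ours) that prints each copy it would perform.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: prints the per-sector copies done by the in-memory
 * reflow for a vdev that grew from 3 to 4 children, over 12 logical
 * sectors of scratch data.
 */
int
main(void)
{
	int ashift = 9;			/* 512-byte sectors, example only */
	int children = 4;		/* width after attaching one disk */
	uint64_t logical_sectors = 12;

	for (uint64_t i = children - 1; i < logical_sectors; i++) {
		int oldchild = i % (children - 1);
		uint64_t oldoff = (i / (children - 1)) << ashift;
		int newchild = i % children;
		uint64_t newoff = (i / children) << ashift;

		printf("sector %llu: disk %d off 0x%llx -> "
		    "disk %d off 0x%llx\n",
		    (unsigned long long)i, oldchild,
		    (unsigned long long)oldoff, newchild,
		    (unsigned long long)newoff);
	}
	return (0);
}

Sectors 0 through children-2 already sit at offset 0 of their final child, which is why the loop starts at i = children - 1; the assertion in the code above checks that no sector is ever copied onto itself.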
+ */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, + logical_size); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", + (long long)logical_size); + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); + + /* + * Update uberblock to indicate that the initial part has been + * reflow'ed. This is needed because after this point (when we exit + * the rangelock), we allow regular writes to this region, which will + * be written to the new location only (because reflow_offset_next == + * reflow_offset_synced). If we crashed and re-copied from the + * scratch space, we would lose the regular writes. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, + logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); + + /* + * Update progress. + */ + vre->vre_offset = logical_size; + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note - raidz_reflow_sync() will update the uberblock state to + * RRSS_SCRATCH_INVALID_SYNCED_REFLOW + */ + raidz_reflow_sync(spa, tx); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); +} + +/* + * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work + * here. No other i/o can be in progress, so we don't need the vre_rangelock. + */ +void +vdev_raidz_reflow_copy_scratch(spa_t *spa) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + ASSERT0(logical_size % raidvd->vdev_children); + uint64_t write_size = logical_size / raidvd->vdev_children; + + zio_t *pio; + + /* + * Read from scratch space. + */ + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(write_size, B_FALSE); + } + + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + + /* + * Overwrite real location with reflow'ed data. 
+ */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " + "to real location", (long long)logical_size); + + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + /* + * Update uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); + spa->spa_ubsync.ub_timestamp++; + VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow recovery: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + spa_first_txg(spa)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset = logical_size; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note that raidz_reflow_sync() will update the uberblock once more + */ + raidz_reflow_sync(spa, tx); + + dmu_tx_commit(tx); + + spa_config_exit(spa, SCL_STATE, FTAG); +} + +static boolean_t +spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) +{ + (void) zthr; + spa_t *spa = arg; + + return (spa->spa_raidz_expand != NULL && + !spa->spa_raidz_expand->vre_waiting_for_resilver); +} + +/* + * RAIDZ expansion background thread + * + * Can be called multiple times if the reflow is paused + */ +static void +spa_raidz_expand_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) + vre->vre_offset = 0; + else + vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); + + /* Reflow the begining portion using the scratch area */ + if (vre->vre_offset == 0) { + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, raidz_reflow_scratch_sync, + vre, 0, ZFS_SPACE_CHECK_NONE)); + + /* if we encountered errors then pause */ + if (vre->vre_offset == 0) { + mutex_enter(&vre->vre_lock); + vre->vre_waiting_for_resilver = B_TRUE; + mutex_exit(&vre->vre_lock); + return; + } + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + uint64_t guid = raidvd->vdev_guid; + + /* Iterate over all the remaining metaslabs */ + for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; + i < raidvd->vdev_ms_count && + !zthr_iscancelled(zthr) && + vre->vre_failed_offset == UINT64_MAX; i++) { + metaslab_t *msp = raidvd->vdev_ms[i]; + + metaslab_disable(msp); + mutex_enter(&msp->ms_lock); + + /* + * The metaslab may be newly created (for the expanded + * space), in which case its trees won't exist yet, + * so we need to bail out early. + */ + if (msp->ms_new) { + mutex_exit(&msp->ms_lock); + metaslab_enable(msp, B_FALSE, B_FALSE); + continue; + } + + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) + * space. Note that there may be a little bit more free + * space (e.g. 
in ms_defer), and it's fine to copy that too. + */ + range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, + NULL, 0, 0); + range_tree_add(rt, msp->ms_start, msp->ms_size); + range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /* + * Force the last sector of each metaslab to be copied. This + * ensures that we advance the on-disk progress to the end of + * this metaslab while the metaslab is disabled. Otherwise, we + * could move past this metaslab without advancing the on-disk + * progress, and then an allocation to this metaslab would not + * be copied. + */ + int sectorsz = 1 << raidvd->vdev_ashift; + uint64_t ms_last_offset = msp->ms_start + + msp->ms_size - sectorsz; + if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { + range_tree_add(rt, ms_last_offset, sectorsz); + } + + /* + * When we are resuming from a paused expansion (i.e. + * when importing a pool with an expansion in progress), + * discard any state that we have already processed. + */ + range_tree_clear(rt, 0, vre->vre_offset); + + while (!zthr_iscancelled(zthr) && + !range_tree_is_empty(rt) && + vre->vre_failed_offset == UINT64_MAX) { + + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * If requested, pause the reflow when the amount + * specified by raidz_expand_max_reflow_bytes is reached + * + * This pause is only used during testing or debugging. + */ + while (raidz_expand_max_reflow_bytes != 0 && + raidz_expand_max_reflow_bytes <= + vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { + delay(hz); + } + + mutex_enter(&vre->vre_lock); + while (vre->vre_outstanding_bytes > + raidz_expand_max_copy_bytes) { + cv_wait(&vre->vre_cv, &vre->vre_lock); + } + mutex_exit(&vre->vre_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. Theoretically, the + * vdev_t that we're expanding may have changed. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + boolean_t needsync = + raidz_reflow_impl(raidvd, vre, rt, tx); + + dmu_tx_commit(tx); + + if (needsync) { + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(spa->spa_dsl_pool, txg); + spa_config_enter(spa, SCL_CONFIG, FTAG, + RW_READER); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + metaslab_enable(msp, B_FALSE, B_FALSE); + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * The txg_wait_synced() here ensures that all reflow zio's have + * completed, and vre_failed_offset has been set if necessary. It + * also ensures that the progress of the last raidz_reflow_sync() is + * written to disk before raidz_reflow_complete_sync() changes the + * in-memory vre_state. vdev_raidz_io_start() uses vre_state to + * determine if a reflow is in progress, in which case we may need to + * write to both old and new locations. 
Therefore we can only change + * vre_state once this is not necessary, which is once the on-disk + * progress (in spa_ubsync) has been set past any possible writes (to + * the end of the last metaslab). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + if (!zthr_iscancelled(zthr) && + vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { + /* + * We are not being canceled or paused, so the reflow must be + * complete. In that case also mark it as completed on disk. + */ + ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + raidz_reflow_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + } else { + /* + * Wait for all copy zio's to complete and for all the + * raidz_reflow_sync() synctasks to be run. + */ + spa_history_log_internal(spa, "reflow pause", + NULL, "offset=%llu failed_offset=%lld", + (long long)vre->vre_offset, + (long long)vre->vre_failed_offset); + mutex_enter(&vre->vre_lock); + if (vre->vre_failed_offset != UINT64_MAX) { + /* + * Reset progress so that we will retry everything + * after the point that something failed. + */ + vre->vre_offset = vre->vre_failed_offset; + vre->vre_failed_offset = UINT64_MAX; + vre->vre_waiting_for_resilver = B_TRUE; + } + mutex_exit(&vre->vre_lock); + } +} + +void +spa_start_raidz_expansion_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); + spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", + spa_raidz_expand_thread_check, spa_raidz_expand_thread, + spa, defclsyspri); +} + +void +raidz_dtl_reassessed(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + if (spa->spa_raidz_expand != NULL) { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + /* + * we get called often from vdev_dtl_reassess() so make + * sure it's our vdev and any replacing is complete + */ + if (vd->vdev_top->vdev_id == vre->vre_vdev_id && + !vdev_raidz_expand_child_replacing(vd->vdev_top)) { + mutex_enter(&vre->vre_lock); + if (vre->vre_waiting_for_resilver) { + vdev_dbgmsg(vd, "DTL reassessed, " + "continuing raidz expansion"); + vre->vre_waiting_for_resilver = B_FALSE; + zthr_wakeup(spa->spa_raidz_expand_zthr); + } + mutex_exit(&vre->vre_lock); + } + } +} + +int +vdev_raidz_attach_check(vdev_t *new_child) +{ + vdev_t *raidvd = new_child->vdev_parent; + uint64_t new_children = raidvd->vdev_children; + + /* + * We use the "boot" space as scratch space to handle overwriting the + * initial part of the vdev. If it is too small, then this expansion + * is not allowed. This would be very unusual (e.g. ashift > 13 and + * >200 children). 
+ */ + if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { + return (EINVAL); + } + return (0); +} + +void +vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *new_child = arg; + spa_t *spa = new_child->vdev_spa; + vdev_t *raidvd = new_child->vdev_parent; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(raidvd->vdev_top, ==, raidvd); + ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); + ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, + new_child); + + spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); + + vdrz->vd_physical_width++; + + VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); + vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; + vdrz->vn_vre.vre_offset = 0; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + spa->spa_raidz_expand = &vdrz->vn_vre; + zthr_wakeup(spa->spa_raidz_expand_zthr); + + /* + * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get + * written to the config. + */ + vdev_config_dirty(raidvd); + + vdrz->vn_vre.vre_start_time = gethrestime_sec(); + vdrz->vn_vre.vre_end_time = 0; + vdrz->vn_vre.vre_state = DSS_SCANNING; + vdrz->vn_vre.vre_bytes_copied = 0; + + uint64_t state = vdrz->vn_vre.vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t start_time = vdrz->vn_vre.vre_start_time; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time, tx)); + + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); + + spa_history_log_internal(spa, "raidz vdev expansion started", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)raidvd->vdev_id, + (unsigned long long)raidvd->vdev_children); +} + +int +vdev_raidz_load(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + int err; + + uint64_t state = DSS_NONE; + uint64_t start_time = 0; + uint64_t end_time = 0; + uint64_t bytes_copied = 0; + + if (vd->vdev_top_zap != 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (bytes_copied), 1, &bytes_copied); + if (err != 0 && err != ENOENT) + return (err); + } + + /* + * If we are in the middle of expansion, vre_state should have + * already been set by vdev_raidz_init(). 
+ */ + EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); + vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; + vdrz->vn_vre.vre_start_time = start_time; + vdrz->vn_vre.vre_end_time = end_time; + vdrz->vn_vre.vre_bytes_copied = bytes_copied; + + return (0); +} + +int +spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (vre == NULL) { + /* no removal in progress; find most recent completed */ + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *vdrz = vd->vdev_tsd; + + if (vdrz->vn_vre.vre_end_time != 0 && + (vre == NULL || + vdrz->vn_vre.vre_end_time > + vre->vre_end_time)) { + vre = &vdrz->vn_vre; + } + } + } + } + + if (vre == NULL) { + return (SET_ERROR(ENOENT)); + } + + pres->pres_state = vre->vre_state; + pres->pres_expanding_vdev = vre->vre_vdev_id; + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + pres->pres_to_reflow = vd->vdev_stat.vs_alloc; + + mutex_enter(&vre->vre_lock); + pres->pres_reflowed = vre->vre_bytes_copied; + for (int i = 0; i < TXG_SIZE; i++) + pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; + mutex_exit(&vre->vre_lock); + + pres->pres_start_time = vre->vre_start_time; + pres->pres_end_time = vre->vre_end_time; + pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; + + return (0); +} + /* * Initialize private RAIDZ specific fields from the nvlist. */ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { - vdev_raidz_t *vdrz; - uint64_t nparity; - uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, @@ -2566,6 +4819,7 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) if (error != 0) return (SET_ERROR(EINVAL)); + uint64_t nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); @@ -2592,10 +4846,56 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) nparity = 1; } - vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); - vdrz->vd_logical_width = children; + vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vn_vre.vre_vdev_id = -1; + vdrz->vn_vre.vre_offset = UINT64_MAX; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); + zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); + mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, + sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); + + vdrz->vd_physical_width = children; vdrz->vd_nparity = nparity; + /* note, the ID does not exist when creating a pool */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &vdrz->vn_vre.vre_vdev_id); + + boolean_t reflow_in_progress = + nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + if (reflow_in_progress) { + spa->spa_raidz_expand = &vdrz->vn_vre; + vdrz->vn_vre.vre_state = DSS_SCANNING; + } + + vdrz->vd_original_width = children; + uint64_t *txgs; + unsigned int txgs_size = 0; + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + &txgs, &txgs_size); + if (error == 0) { + for (int i = 0; i < txgs_size; i++) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = txgs[txgs_size - i - 1]; + re->re_logical_width = vdrz->vd_physical_width - i; 
+ + if (reflow_in_progress) + re->re_logical_width--; + + avl_add(&vdrz->vd_expand_txgs, re); + } + + vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; + } + if (reflow_in_progress) { + vdrz->vd_original_width--; + zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", + children, txgs_size); + } + *tsd = vdrz; return (0); @@ -2604,7 +4904,20 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) static void vdev_raidz_fini(vdev_t *vd) { - kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) + vd->vdev_spa->spa_raidz_expand = NULL; + reflow_node_t *re; + void *cookie = NULL; + avl_tree_t *tree = &vdrz->vd_expand_txgs; + while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) + kmem_free(re, sizeof (*re)); + avl_destroy(&vdrz->vd_expand_txgs); + mutex_destroy(&vdrz->vd_expand_lock); + mutex_destroy(&vdrz->vn_vre.vre_lock); + cv_destroy(&vdrz->vn_vre.vre_cv); + zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); + kmem_free(vdrz, sizeof (*vdrz)); } /* @@ -2632,6 +4945,29 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) * it. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + } + + mutex_enter(&vdrz->vd_expand_lock); + if (!avl_is_empty(&vdrz->vd_expand_txgs)) { + uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); + uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + uint64_t i = 0; + + for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); + re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { + txgs[i++] = re->re_txg; + } + + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + txgs, count); + + kmem_free(txgs, sizeof (uint64_t) * count); + } + mutex_exit(&vdrz->vd_expand_lock); } static uint64_t @@ -2671,3 +5007,15 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, + "For testing, pause RAIDZ expansion after reflowing this many bytes"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, + "Max amount of concurrent i/o for RAIDZ expansion"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, + "For expanded RAIDZ, aggregate reads that have more rows than this"); +ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, + "For expanded RAIDZ, automatically start a pool scrub when expansion " + "completes"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c index 03e17db024ea..1c54eae40355 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -169,7 +169,8 @@ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } /* @@ -180,6 +181,7 @@ vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || + tvd->vdev_rz_expanding || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } @@ -222,7 +224,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = 
spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; @@ -1005,6 +1008,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_rz_expanding); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, @@ -1162,12 +1166,13 @@ vdev_trim_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; - if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, vd->vdev_trim_rate, @@ -1492,7 +1497,8 @@ vdev_autotrim(spa_t *spa) mutex_enter(&tvd->vdev_autotrim_lock); if (vdev_writeable(tvd) && !tvd->vdev_removing && - tvd->vdev_autotrim_thread == NULL) { + tvd->vdev_autotrim_thread == NULL && + !tvd->vdev_rz_expanding) { ASSERT3P(tvd->vdev_top, ==, tvd); tvd->vdev_autotrim_thread = thread_create(NULL, 0, @@ -1717,6 +1723,7 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
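
As a reader's aid, the following minimal, standalone C sketch (not taken from the commit) models how the reflow state persisted in the uberblock drives the resume logic added above: spa_raidz_expand_thread() restarts from offset 0 only when the state is RRSS_SCRATCH_VALID, in which case vdev_raidz_reflow_copy_scratch() first rewrites the initial region from the scratch area; every other state simply resumes the metaslab-by-metaslab copy from the saved offset. The EX_* identifiers and decide_resume() are hypothetical stand-ins for the RRSS_* macros and states referenced in the diff.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical mirror of the RRSS_SCRATCH_* states named in the diff. */
typedef enum {
	EX_SCRATCH_NOT_IN_USE,			/* no reflow data lives in the scratch area */
	EX_SCRATCH_VALID,			/* scratch holds the only copy of the initial region */
	EX_SCRATCH_INVALID_SYNCED,		/* initial region already rewritten in place */
	EX_SCRATCH_INVALID_SYNCED_ON_IMPORT,	/* rewritten in place during import recovery */
	EX_SCRATCH_INVALID_SYNCED_REFLOW	/* recorded by the regular reflow sync path */
} ex_reflow_state_t;

typedef enum {
	EX_RESUME_COPY_SCRATCH,	/* copy scratch back to the real location, then reflow from 0 */
	EX_RESUME_FROM_OFFSET	/* continue the metaslab-by-metaslab copy */
} ex_resume_action_t;

/*
 * Only a still-valid scratch copy forces the initial region to be written
 * back before the copy continues; otherwise resume from the saved offset.
 */
static ex_resume_action_t
decide_resume(ex_reflow_state_t state, uint64_t saved_offset,
    uint64_t *resume_offset)
{
	if (state == EX_SCRATCH_VALID) {
		*resume_offset = 0;
		return (EX_RESUME_COPY_SCRATCH);
	}
	*resume_offset = saved_offset;
	return (EX_RESUME_FROM_OFFSET);
}

int
main(void)
{
	uint64_t off;

	(void) decide_resume(EX_SCRATCH_VALID, 123ULL << 20, &off);
	printf("SCRATCH_VALID: rewrite from scratch, resume at %llu\n",
	    (unsigned long long)off);
	(void) decide_resume(EX_SCRATCH_INVALID_SYNCED, 123ULL << 20, &off);
	printf("SCRATCH_INVALID_SYNCED: resume at %llu\n",
	    (unsigned long long)off);
	return (0);
}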
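
A second standalone sketch (again illustrative, not from the commit) captures the sizing rule behind vdev_raidz_attach_check() and the per-child scratch math asserted in vdev_raidz_reflow_copy_scratch(): one sector per child of the widened vdev must fit in the reserved boot region used as scratch space, and the reflowed initial region is split evenly across the children. EX_BOOT_SIZE assumes that region is 3.5 MiB, which is consistent with the "(e.g. ashift > 13 and >200 children)" comment above but should be checked against VDEV_BOOT_SIZE in the on-disk format headers; all EX_* names are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define	EX_BOOT_SIZE	(7ULL << 19)	/* assumed 3.5 MiB reserved boot region */

/* Mirrors the check in vdev_raidz_attach_check(): refuse if it cannot fit. */
static int
ex_attach_allowed(uint64_t new_children, uint64_t ashift)
{
	return ((new_children << ashift) <= EX_BOOT_SIZE);
}

/*
 * The initial region reflowed through scratch is divided evenly across the
 * children; the real code asserts logical_size % vdev_children == 0.
 */
static uint64_t
ex_per_child_scratch(uint64_t logical_size, uint64_t children)
{
	return (logical_size / children);
}

int
main(void)
{
	printf("12 children, ashift 12: %s\n",
	    ex_attach_allowed(12, 12) ? "allowed" : "refused");
	printf("250 children, ashift 14: %s\n",
	    ex_attach_allowed(250, 14) ? "allowed" : "refused");
	printf("per-child scratch for a 3 MiB initial region, 12 children: "
	    "%llu bytes\n",
	    (unsigned long long)ex_per_child_scratch(3ULL << 20, 12));
	return (0);
}

Under the 3.5 MiB assumption, 16 KiB sectors (ashift 14) start being refused at 225 children (3.5 MiB / 16 KiB = 224), which is why the in-tree comment describes a refusal as very unusual.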